// Copyright 2015 The Cockroach Authors.
//
// Use of this software is governed by the CockroachDB Software License
// included in the /LICENSE file.
//go:build !bazel

package bazel
// This file contains stub implementations for non-bazel builds.
// See bazel.go for full documentation on the contracts of these functions.
// BuiltWithBazel returns true iff this library was built with Bazel.
func BuiltWithBazel() bool {
return false
}
// InBazelTest returns true iff called from a test run by Bazel.
func InBazelTest() bool {
return false
}
// InTestWrapper returns true iff called from Bazel's generated test wrapper.
func InTestWrapper() bool {
return false
}
// FindBinary is not implemented.
func FindBinary(pkg, name string) (string, bool) {
panic("not build with Bazel")
}
// Runfile is not implemented.
func Runfile(string) (string, error) {
panic("not built with Bazel")
}
// RunfilesPath is not implemented.
func RunfilesPath() (string, error) {
panic("not built with Bazel")
}
// TestTmpDir is not implemented.
func TestTmpDir() string {
panic("not built with Bazel")
}
// NewTmpDir is not implemented.
func NewTmpDir(prefix string) (string, error) {
panic("not built with Bazel")
}
// RelativeTestTargetPath is not implemented.
func RelativeTestTargetPath() string {
panic("not built with Bazel")
}
// SetGoEnv is not implemented.
func SetGoEnv() {
panic("not built with Bazel")
}
// Copyright 2023 The Cockroach Authors.
//
// Use of this software is governed by the CockroachDB Software License
// included in the /LICENSE file.
package pgcryptocipherccl
import (
"crypto/aes"
"crypto/cipher"
"github.com/cockroachdb/cockroach/pkg/sql/pgwire/pgcode"
"github.com/cockroachdb/cockroach/pkg/sql/pgwire/pgerror"
"github.com/cockroachdb/errors"
)
var (
// ErrInvalidDataLength reports an attempt to either Encrypt or Decrypt data
// of invalid length.
ErrInvalidDataLength = pgerror.New(pgcode.InvalidParameterValue, "pgcryptocipherccl: invalid data length")
)
// Encrypt returns the ciphertext obtained by running the encryption
// algorithm for the specified cipher type with the provided key and
// initialization vector over the provided data.
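// An illustrative sketch (not taken from the package's tests); "aes",
// "aes-cbc", and "aes-cbc/pad:pkcs" all select the same AES-CBC path with
// PKCS padding:
//
// ciphertext, err := Encrypt(plaintext, key, iv, "aes-cbc/pad:pkcs")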
func Encrypt(data []byte, key []byte, iv []byte, cipherType string) ([]byte, error) {
method, err := parseCipherMethod(cipherType)
if err != nil {
return nil, err
}
block, err := newCipher(method, key)
if err != nil {
return nil, err
}
blockSize := block.BlockSize()
data, err = padData(method, data, blockSize)
if err != nil {
return nil, err
}
err = validateDataLength(data, blockSize)
if err != nil {
return nil, err
}
return encrypt(method, block, iv, data)
}
// Decrypt returns the plaintext obtained by running the decryption
// algorithm for the specified cipher type with the provided key and
// initialization vector over the provided data.
func Decrypt(data []byte, key []byte, iv []byte, cipherType string) ([]byte, error) {
method, err := parseCipherMethod(cipherType)
if err != nil {
return nil, err
}
block, err := newCipher(method, key)
if err != nil {
return nil, err
}
blockSize := block.BlockSize()
err = validateDataLength(data, blockSize)
if err != nil {
return nil, err
}
data, err = decrypt(method, block, iv, data)
if err != nil {
return nil, err
}
return unpadData(method, data)
}
func newCipher(method cipherMethod, key []byte) (cipher.Block, error) {
switch a := method.algorithm; a {
case aesCipher:
var err error
switch l := len(key); {
case l >= 32:
key, err = zeroPadOrTruncate(key, 32)
case l >= 24:
key, err = zeroPadOrTruncate(key, 24)
default:
key, err = zeroPadOrTruncate(key, 16)
}
if err != nil {
return nil, err
}
return aes.NewCipher(key)
default:
return nil, errors.AssertionFailedf("cannot create new cipher for unknown algorithm: %d", a)
}
}
func padData(method cipherMethod, data []byte, blockSize int) ([]byte, error) {
switch p := method.padding; p {
case pkcsPadding:
return pkcsPad(data, blockSize)
case noPadding:
return data, nil
default:
return nil, errors.AssertionFailedf("cannot pad for unknown padding: %d", p)
}
}
func unpadData(method cipherMethod, data []byte) ([]byte, error) {
switch p := method.padding; p {
case pkcsPadding:
return pkcsUnpad(data)
case noPadding:
return data, nil
default:
return nil, errors.AssertionFailedf("cannot unpad for unknown padding: %d", p)
}
}
func validateDataLength(data []byte, blockSize int) error {
if dataLength := len(data); dataLength%blockSize != 0 {
return errors.Wrapf(
ErrInvalidDataLength,
`data has length %d, which is not a multiple of block size %d`,
dataLength, blockSize,
)
}
return nil
}
func encrypt(method cipherMethod, block cipher.Block, iv []byte, data []byte) ([]byte, error) {
switch m := method.mode; m {
case cbcMode:
var err error
ret := make([]byte, len(data))
iv, err = zeroPadOrTruncate(iv, block.BlockSize())
if err != nil {
return nil, err
}
mode := cipher.NewCBCEncrypter(block, iv)
mode.CryptBlocks(ret, data)
return ret, nil
default:
return nil, errors.AssertionFailedf("cannot encrypt for unknown mode: %d", m)
}
}
func decrypt(method cipherMethod, block cipher.Block, iv []byte, data []byte) ([]byte, error) {
switch m := method.mode; m {
case cbcMode:
var err error
ret := make([]byte, len(data))
iv, err = zeroPadOrTruncate(iv, block.BlockSize())
if err != nil {
return nil, err
}
mode := cipher.NewCBCDecrypter(block, iv)
mode.CryptBlocks(ret, data)
return ret, nil
default:
return nil, errors.AssertionFailedf("cannot decrypt for unknown mode: %d", m)
}
}
// Copyright 2024 The Cockroach Authors.
//
// Use of this software is governed by the CockroachDB Software License
// included in the /LICENSE file.
package pgcryptocipherccl
import (
"crypto/aes"
"testing"
"github.com/stretchr/testify/require"
)
func FuzzEncryptDecryptAES(f *testing.F) {
for _, tc := range CipherTestCases {
f.Add(tc.Plaintext, tc.Key, tc.Iv)
}
f.Fuzz(func(t *testing.T, plaintext []byte, key []byte, iv []byte) {
ciphertext, err := Encrypt(plaintext, key, iv, "aes")
require.NoError(t, err)
decryptedCiphertext, err := Decrypt(ciphertext, key, iv, "aes")
require.NoError(t, err)
require.Equal(t, plaintext, decryptedCiphertext)
})
}
func FuzzNoPaddingEncryptDecryptAES(f *testing.F) {
for _, tc := range CipherTestCases {
f.Add(tc.Plaintext, tc.Key, tc.Iv)
}
f.Fuzz(func(t *testing.T, plaintext []byte, key []byte, iv []byte) {
ciphertext, err := Encrypt(plaintext, key, iv, "aes/pad:none")
if plaintextLength := len(plaintext); plaintextLength%aes.BlockSize != 0 {
require.ErrorIs(t, err, ErrInvalidDataLength)
return
}
require.NoError(t, err)
decryptedCiphertext, err := Decrypt(ciphertext, key, iv, "aes/pad:none")
require.NoError(t, err)
require.Equal(t, plaintext, decryptedCiphertext)
})
}
// Copyright 2023 The Cockroach Authors.
//
// Use of this software is governed by the CockroachDB Software License
// included in the /LICENSE file.
package pgcryptocipherccl
import (
"regexp"
"strings"
"github.com/cockroachdb/cockroach/pkg/sql/pgwire/pgcode"
"github.com/cockroachdb/cockroach/pkg/sql/pgwire/pgerror"
"github.com/cockroachdb/cockroach/pkg/util/errorutil/unimplemented"
)
type cipherAlgorithm int
const (
_ cipherAlgorithm = iota
aesCipher
)
type cipherMode int
const (
cbcMode cipherMode = iota
)
type cipherPadding int
const (
pkcsPadding cipherPadding = iota
noPadding
)
type cipherMethod struct {
algorithm cipherAlgorithm
mode cipherMode
padding cipherPadding
}
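// cipherMethodRE matches cipher method strings of the form
// "<algorithm>[-<mode>][/pad:<padding>]", for example "aes", "aes-cbc", or
// "aes-cbc/pad:none"; parseCipherMethod below interprets the named groups.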
var cipherMethodRE = regexp.MustCompile("^(?P<algorithm>[[:alpha:]]+)(?:-(?P<mode>[[:alpha:]]+))?(?:/pad:(?P<padding>[[:alpha:]]+))?$")
func parseCipherMethod(s string) (cipherMethod, error) {
submatches := cipherMethodRE.FindStringSubmatch(s)
if submatches == nil {
return cipherMethod{}, pgerror.Newf(pgcode.InvalidParameterValue, `cipher method has wrong format: "%s"`, s)
}
ret := cipherMethod{}
switch algorithm := submatches[cipherMethodRE.SubexpIndex("algorithm")]; strings.ToLower(algorithm) {
case "aes":
ret.algorithm = aesCipher
case "bf":
return cipherMethod{}, unimplemented.NewWithIssue(105466, "Blowfish is insecure and not supported")
default:
return cipherMethod{}, pgerror.Newf(pgcode.InvalidParameterValue, `cipher method has invalid algorithm: "%s"`, algorithm)
}
switch mode := submatches[cipherMethodRE.SubexpIndex("mode")]; strings.ToLower(mode) {
case "", "cbc":
case "ecb":
return cipherMethod{}, unimplemented.NewWithIssue(105466, "ECB mode is insecure and not supported")
default:
return cipherMethod{}, pgerror.Newf(pgcode.InvalidParameterValue, `cipher method has invalid mode: "%s"`, mode)
}
switch padding := submatches[cipherMethodRE.SubexpIndex("padding")]; strings.ToLower(padding) {
case "", "pkcs":
case "none":
ret.padding = noPadding
default:
return cipherMethod{}, pgerror.Newf(pgcode.InvalidParameterValue, `cipher method has invalid padding: "%s"`, padding)
}
return ret, nil
}
// Copyright 2023 The Cockroach Authors.
//
// Use of this software is governed by the CockroachDB Software License
// included in the /LICENSE file.
package pgcryptocipherccl
import (
"bytes"
"math"
"github.com/cockroachdb/cockroach/pkg/sql/pgwire/pgcode"
"github.com/cockroachdb/cockroach/pkg/sql/pgwire/pgerror"
"github.com/cockroachdb/errors"
)
// pkcsPad pads a slice of bytes to a multiple of the given block size
// using the process specified in
// https://datatracker.ietf.org/doc/html/rfc5652#section-6.3.
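// For example, with blockSize 16 a 13-byte input gains three bytes of value
// 0x03, and an input whose length is already a multiple of 16 gains a full
// block of bytes with value 0x10.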
func pkcsPad(data []byte, blockSize int) ([]byte, error) {
if blockSize <= 0 || blockSize > math.MaxUint8 {
return nil, errors.AssertionFailedf("invalid block size for PKCS padding: %d", blockSize)
}
paddedData := make([]byte, len(data))
copy(paddedData, data)
paddingSize := blockSize - len(data)%blockSize
padding := bytes.Repeat([]byte{byte(paddingSize)}, paddingSize)
paddedData = append(paddedData, padding...)
return paddedData, nil
}
// pkcsUnpad removes the padding added by pkcsPad.
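// For example, if the data ends in the byte 0x03, the final three bytes must
// all equal 0x03 and are stripped; otherwise an error is returned.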
func pkcsUnpad(data []byte) ([]byte, error) {
if len(data) == 0 {
return nil, pgerror.New(pgcode.InvalidParameterValue, "PKCS-padded data is empty")
}
paddingLen := data[len(data)-1]
if paddingLen == 0 || int(paddingLen) > len(data) {
return nil, pgerror.Newf(pgcode.InvalidParameterValue,
"invalid final byte found in PKCS-padded data: %d", paddingLen)
}
for i := 1; i < int(paddingLen); i++ {
if b := data[len(data)-i-1]; b != paddingLen {
return nil, pgerror.Newf(pgcode.InvalidParameterValue,
"invalid byte found in PKCS-padded data: expected %d, but found %d", paddingLen, b)
}
}
return data[:len(data)-int(paddingLen)], nil
}
// zeroPadOrTruncate pads a slice of bytes with zeroes if its length is smaller
// than size and truncates the slice to length size otherwise.
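// For example (illustrative), zeroPadOrTruncate([]byte{0xAA}, 3) returns
// {0xAA, 0x00, 0x00}, while zeroPadOrTruncate([]byte{0xAA, 0xBB, 0xCC}, 2)
// returns {0xAA, 0xBB}.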
func zeroPadOrTruncate(data []byte, size int) ([]byte, error) {
if size < 0 {
return nil, errors.AssertionFailedf("cannot zero pad or truncate to negative size")
}
if len(data) >= size {
return data[:size], nil
}
paddedData := make([]byte, size)
copy(paddedData, data)
return paddedData, nil
}
// Copyright 2022 The Cockroach Authors.
//
// Use of this software is governed by the CockroachDB Software License
// included in the /LICENSE file.
package storage
import (
"github.com/cockroachdb/cockroach/pkg/roachpb"
"github.com/cockroachdb/cockroach/pkg/util"
"github.com/cockroachdb/cockroach/pkg/util/hlc"
"github.com/cockroachdb/errors"
)
// BackupCompactionIterator wraps a SimpleMVCCIterator and only surfaces the
// latest valid key of a given MVCC key, including point tombstones, at or
// below the asOf timestamp.
//
// The iterator assumes that it will not encounter any write intents and that the
// wrapped SimpleMVCCIterator *only* surfaces point keys.
type BackupCompactionIterator struct {
iter SimpleMVCCIterator
// asOf is the latest timestamp of a key surfaced by the iterator.
asOf hlc.Timestamp
// valid tracks if the current key is valid.
valid bool
// err tracks if iterating to the current key returned an error.
err error
}
var _ SimpleMVCCIterator = &BackupCompactionIterator{}
// NewBackupCompactionIterator creates a new BackupCompactionIterator. The asOf timestamp cannot be empty.
func NewBackupCompactionIterator(
iter SimpleMVCCIterator, asOf hlc.Timestamp,
) (*BackupCompactionIterator, error) {
if asOf.IsEmpty() {
return nil, errors.New("asOf timestamp cannot be empty")
}
return &BackupCompactionIterator{
iter: iter,
asOf: asOf,
}, nil
}
func (f *BackupCompactionIterator) Close() {
f.iter.Close()
}
// Next is identical to NextKey, as BackupCompactionIterator surfaces at most
// one version per key.
func (f *BackupCompactionIterator) Next() {
f.NextKey()
}
func (f *BackupCompactionIterator) NextKey() {
f.iter.NextKey()
f.advance()
}
func (f *BackupCompactionIterator) SeekGE(originalKey MVCCKey) {
// See the ReadAsOfIterator comment for an explanation of this synthetic seek.
synthetic := MVCCKey{Key: originalKey.Key, Timestamp: f.asOf}
f.iter.SeekGE(synthetic)
if f.advance(); f.valid && f.UnsafeKey().Less(originalKey) {
f.NextKey()
}
}
func (f *BackupCompactionIterator) updateValid() bool {
f.valid, f.err = f.iter.Valid()
return f.valid
}
// advance moves past keys with timestamps later than f.asOf.
func (f *BackupCompactionIterator) advance() {
for {
if ok := f.updateValid(); !ok {
return
}
if key := f.iter.UnsafeKey(); f.asOf.Less(key.Timestamp) {
f.iter.Next()
continue
}
return
}
}
func (f *BackupCompactionIterator) UnsafeKey() MVCCKey {
return f.iter.UnsafeKey()
}
func (f *BackupCompactionIterator) UnsafeValue() ([]byte, error) {
return f.iter.UnsafeValue()
}
func (f *BackupCompactionIterator) Valid() (bool, error) {
if util.RaceEnabled && f.valid {
if err := f.assertInvariants(); err != nil {
return false, err
}
}
return f.valid, f.err
}
func (f *BackupCompactionIterator) MVCCValueLenAndIsTombstone() (int, bool, error) {
return f.iter.MVCCValueLenAndIsTombstone()
}
func (f *BackupCompactionIterator) ValueLen() int {
return f.iter.ValueLen()
}
func (f *BackupCompactionIterator) HasPointAndRange() (bool, bool) {
hasPoint, hasRange := f.iter.HasPointAndRange()
if hasRange {
panic("unexpected range tombstone")
}
return hasPoint, hasRange
}
func (f *BackupCompactionIterator) RangeBounds() roachpb.Span {
return roachpb.Span{}
}
func (f *BackupCompactionIterator) RangeKeys() MVCCRangeKeyStack {
return MVCCRangeKeyStack{}
}
func (f *BackupCompactionIterator) RangeKeyChanged() bool {
return false
}
// assertInvariants checks that the iterator is in a valid state: the
// underlying iterator must also be valid, and the emitted key must carry a
// timestamp at or below asOf.
func (f *BackupCompactionIterator) assertInvariants() error {
if err := assertSimpleMVCCIteratorInvariants(f); err != nil {
return err
}
if ok, err := f.iter.Valid(); !ok || err != nil {
errMsg := ""
if err != nil {
errMsg = err.Error()
}
return errors.AssertionFailedf("invalid underlying iter with err=%s", errMsg)
}
key := f.UnsafeKey()
if key.Timestamp.IsEmpty() {
return errors.AssertionFailedf("emitted key %s has no timestamp", key)
}
if f.asOf.Less(key.Timestamp) {
return errors.AssertionFailedf("emitted key %s above asOf timestamp %s", key, f.asOf)
}
return nil
}
// Copyright 2021 The Cockroach Authors.
//
// Use of this software is governed by the CockroachDB Software License
// included in the /LICENSE file.
package storage
import (
"github.com/cockroachdb/cockroach/pkg/base"
"github.com/cockroachdb/cockroach/pkg/storage/storagepb"
"github.com/cockroachdb/cockroach/pkg/util/envutil"
"github.com/cockroachdb/cockroach/pkg/util/sysutil"
"github.com/cockroachdb/errors"
"github.com/cockroachdb/errors/oserror"
"github.com/cockroachdb/pebble/vfs"
)
// ballastsEnabled allows overriding the automatic creation of the ballast
// files through an environment variable. Developers working on CockroachDB
// may want to include `COCKROACH_AUTO_BALLAST=false` in their environment to
// prevent the automatic creation of large ballast files on their local
// filesystem.
var ballastsEnabled bool = envutil.EnvOrDefaultBool("COCKROACH_AUTO_BALLAST", true)
// IsDiskFull examines the store indicated by spec, determining whether the
// store's underlying disk is out of disk space. A disk is considered to be
// full if available capacity is less than half of the store's ballast size.
//
// If the current on-disk ballast does not match the configured ballast size
// in spec, IsDiskFull will resize the file if available capacity allows.
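// For example, with a 1 GiB ballast the store's disk is reported as full once
// the filesystem has less than 512 MiB available.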
func IsDiskFull(fs vfs.FS, spec base.StoreSpec) (bool, error) {
if spec.InMemory {
return false, nil
}
// The store directory might not exist yet. We don't want to try to create
// it yet, because there might not be any disk space to do so. Check the
// disk usage on the first parent that exists.
path := spec.Path
diskUsage, err := fs.GetDiskUsage(path)
for oserror.IsNotExist(err) {
if parentPath := fs.PathDir(path); parentPath == path {
break
} else {
path = parentPath
}
diskUsage, err = fs.GetDiskUsage(path)
}
if err != nil {
return false, errors.Wrapf(err, "retrieving disk usage: %s", spec.Path)
}
// Try to resize the ballast now, if necessary. This is necessary to
// truncate the ballast if a new, lower ballast size was provided,
// and the disk space freed by truncation will allow us to start. If
// we need to create or grow the ballast but are unable because
// there's insufficient disk space, it'll be resized by the periodic
// capacity calculations when the conditions are met.
desiredSizeBytes := BallastSizeBytes(spec, diskUsage)
ballastPath := base.EmergencyBallastFile(fs.PathJoin, spec.Path)
if resized, err := maybeEstablishBallast(fs, ballastPath, desiredSizeBytes, diskUsage); err != nil {
return false, err
} else if resized {
diskUsage, err = fs.GetDiskUsage(path)
if err != nil {
return false, errors.Wrapf(err, "retrieving disk usage: %s", spec.Path)
}
}
// If the filesystem reports less than half the disk space available,
// consider the disk full. If the ballast hasn't been removed yet,
// removing it will free enough disk space to start. We don't use exactly
// the ballast size in case some of the headroom gets consumed elsewhere:
// eg, the operator's shell history, system logs, copy-on-write filesystem
// metadata, etc.
return diskUsage.AvailBytes < uint64(desiredSizeBytes/2), nil
}
// BallastSizeBytes returns the desired size of the emergency ballast,
// calculated from the provided store spec and disk usage. If the store spec
// contains an explicit ballast size (either in bytes or as a percentage of
// the disk's total capacity), the store spec's size is used. Otherwise,
// BallastSizeBytes returns 1GiB or 1% of total capacity, whichever is
// smaller.
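// For example, a 2 TiB disk defaults to the 1 GiB cap, while a 50 GiB disk
// defaults to 1% of capacity, i.e. 512 MiB.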
func BallastSizeBytes(spec base.StoreSpec, diskUsage vfs.DiskUsage) int64 {
if spec.BallastSize != nil {
v := spec.BallastSize.Capacity
if spec.BallastSize.Percent != 0 {
v = int64(float64(diskUsage.TotalBytes) * spec.BallastSize.Percent / 100)
}
return v
}
// Default to a 1% or 1GiB ballast, whichever is smaller.
var v int64 = 1 << 30 // 1 GiB
if p := int64(float64(diskUsage.TotalBytes) * 0.01); v > p {
v = p
}
return v
}
// SecondaryCacheBytes returns the desired size of the secondary cache, calculated
// from the provided size spec and disk usage. If the spec contains an explicit
// size (either in bytes or as a percentage of the disk's total capacity), that
// size is used. A zero value for cacheSize results in no secondary cache.
func SecondaryCacheBytes(cacheSize storagepb.SizeSpec, diskUsage vfs.DiskUsage) int64 {
v := cacheSize.Capacity
if cacheSize.Percent != 0 {
v = int64(float64(diskUsage.TotalBytes) * cacheSize.Percent / 100)
}
return v
}
func maybeEstablishBallast(
fs vfs.FS, ballastPath string, ballastSizeBytes int64, diskUsage vfs.DiskUsage,
) (resized bool, err error) {
var currentSizeBytes int64
fi, err := fs.Stat(ballastPath)
if err != nil && !oserror.IsNotExist(err) {
return false, err
} else if err == nil {
currentSizeBytes = fi.Size()
}
switch {
case currentSizeBytes > ballastSizeBytes:
// If the current ballast is too big, shrink it regardless of current
// disk space availability.
// TODO(jackson): Expose Truncate on vfs.FS.
return true, sysutil.ResizeLargeFile(ballastPath, ballastSizeBytes)
case currentSizeBytes < ballastSizeBytes && ballastsEnabled:
if err := fs.MkdirAll(fs.PathDir(ballastPath), 0755); err != nil {
return false, errors.Wrap(err, "creating data directory")
}
// We need to either create the ballast or extend the current ballast
// to make it larger. The ballast may have been intentionally removed
// to enable recovery. Only create/extend the ballast if there's
// sufficient disk space.
extendBytes := ballastSizeBytes - currentSizeBytes
// If available disk space is >= 4x the required amount, create the
// ballast.
if extendBytes <= int64(diskUsage.AvailBytes)/4 {
return true, sysutil.ResizeLargeFile(ballastPath, ballastSizeBytes)
}
// If the user configured a really large ballast, we might not ever
// have >= 4x the required amount available. Larger ballast sizes (eg,
// 5%, 10%) are not unreasonably large, but it's possible that after
// recovery available capacity won't exceed 4x the ballast sizes (eg,
// 20%, 40%). Allow extending the ballast if we will have 10 GiB
// available after the extension to account for these large ballasts.
if int64(diskUsage.AvailBytes)-extendBytes > (10 << 30 /* 10 GiB */) {
return true, sysutil.ResizeLargeFile(ballastPath, ballastSizeBytes)
}
return false, nil
default:
return false, nil
}
}
// Copyright 2014 The Cockroach Authors.
//
// Use of this software is governed by the CockroachDB Software License
// included in the /LICENSE file.
package storage
import (
"math"
"github.com/cockroachdb/errors"
"github.com/cockroachdb/pebble"
"github.com/cockroachdb/pebble/batchrepr"
"github.com/cockroachdb/pebble/rangekey"
)
// Ensure that we always update the batch reader to consider any necessary
// updates when a new key kind is introduced. To do this, we assert that the
// latest key we considered equals InternalKeyKindMax, ensuring that compilation
// will fail if it's not. Unfortunately, this doesn't protect against reusing a
// currently unused RocksDB key kind.
const _ = uint(pebble.InternalKeyKindExcise - pebble.InternalKeyKindMax)
// decodeBatchHeader decodes the header of Pebble batch representation,
// returning the parsed header and a batchrepr.Reader into the contents of the
// batch.
func decodeBatchHeader(repr []byte) (h batchrepr.Header, r batchrepr.Reader, err error) {
h, ok := batchrepr.ReadHeader(repr)
switch {
case !ok:
return batchrepr.Header{}, nil, errors.Errorf("batch invalid: too small: %d bytes", len(repr))
case h.SeqNum != 0:
return batchrepr.Header{}, nil, errors.Errorf("batch invalid: bad sequence: expected 0, but found %d", h.SeqNum)
case h.Count > math.MaxInt32:
return batchrepr.Header{}, nil, errors.Errorf("batch invalid: count %d would overflow 32-bit signed int", h.Count)
}
return h, batchrepr.Read(repr), nil
}
// BatchReader is used to iterate the entries in a Pebble batch
// representation.
//
// Example:
//
// r, err := NewBatchReader(...)
// if err != nil {
// return err
// }
// for r.Next() {
// switch r.KeyKind() {
// case pebble.InternalKeyKindDelete:
// fmt.Printf("delete(%x)", r.Key())
// case pebble.InternalKeyKindSet:
// fmt.Printf("put(%x,%x)", r.Key(), r.Value())
// case pebble.InternalKeyKindMerge:
// fmt.Printf("merge(%x,%x)", r.Key(), r.Value())
// case pebble.InternalKeyKindSingleDelete:
// fmt.Printf("single_delete(%x)", r.Key())
// case pebble.InternalKeyKindRangeDelete:
// fmt.Printf("delete_range(%x,%x)", r.Key(), r.Value())
// }
// }
//
// if err := r.Error(); err != nil {
// return err
// }
type BatchReader struct {
header batchrepr.Header
reader batchrepr.Reader
// The error encountered during iteration, if any.
err error
// The following all represent the current entry and are updated by Next.
// `value` is not applicable for all key kinds. For RangeDelete, value
// indicates the end key for the range deletion.
kind pebble.InternalKeyKind
key []byte
value []byte
}
// NewBatchReader creates a BatchReader from the given batch repr and
// verifies the header.
func NewBatchReader(repr []byte) (*BatchReader, error) {
h, r, err := decodeBatchHeader(repr)
if err != nil {
return nil, err
}
return &BatchReader{header: h, reader: r}, nil
}
// Count returns the declared number of entries in the batch.
func (r *BatchReader) Count() int {
return int(r.header.Count)
}
// Error returns the error, if any, which the iterator encountered.
func (r *BatchReader) Error() error {
return r.err
}
// KeyKind returns the kind of the current entry.
func (r *BatchReader) KeyKind() pebble.InternalKeyKind {
return r.kind
}
// Key returns the key of the current batch entry.
func (r *BatchReader) Key() []byte {
return r.key
}
// MVCCKey returns the MVCC key of the current batch entry.
func (r *BatchReader) MVCCKey() (MVCCKey, error) {
return DecodeMVCCKey(r.Key())
}
// EngineKey returns the EngineKey for the current batch entry.
func (r *BatchReader) EngineKey() (EngineKey, error) {
key, ok := DecodeEngineKey(r.Key())
if !ok {
return key, errors.Errorf("invalid encoded engine key: %x", r.Key())
}
return key, nil
}
// Value returns the value of the current batch entry. Value panics if the
// kind is a point key deletion.
func (r *BatchReader) Value() []byte {
switch r.kind {
case pebble.InternalKeyKindDelete, pebble.InternalKeyKindSingleDelete:
panic("cannot call Value on a deletion entry")
default:
return r.value
}
}
// EndKey returns the raw end key of the current ranged batch entry.
func (r *BatchReader) EndKey() ([]byte, error) {
var rawKey []byte
switch r.kind {
case pebble.InternalKeyKindRangeDelete:
rawKey = r.Value()
case pebble.InternalKeyKindRangeKeySet, pebble.InternalKeyKindRangeKeyUnset, pebble.InternalKeyKindRangeKeyDelete:
rangeKeys, err := r.rangeKeys()
if err != nil {
return nil, err
}
rawKey = rangeKeys.End
default:
return nil, errors.AssertionFailedf(
"can only ask for EndKey on a ranged entry, got %v", r.kind)
}
return rawKey, nil
}
// EngineEndKey returns the engine end key of the current ranged batch entry.
func (r *BatchReader) EngineEndKey() (EngineKey, error) {
rawKey, err := r.EndKey()
if err != nil {
return EngineKey{}, err
}
key, ok := DecodeEngineKey(rawKey)
if !ok {
return key, errors.Errorf("invalid encoded engine key: %x", rawKey)
}
return key, nil
}
// RawRangeKeys returns the raw range key values at the current entry.
func (r *BatchReader) RawRangeKeys() ([]rangekey.Key, error) {
switch r.kind {
case pebble.InternalKeyKindRangeKeySet, pebble.InternalKeyKindRangeKeyUnset:
default:
return nil, errors.AssertionFailedf(
"can only ask for range keys on a range key entry, got %v", r.kind)
}
rangeKeys, err := r.rangeKeys()
if err != nil {
return nil, err
}
return rangeKeys.Keys, nil
}
// EngineRangeKeys returns the engine range key values at the current entry.
func (r *BatchReader) EngineRangeKeys() ([]EngineRangeKeyValue, error) {
switch r.kind {
case pebble.InternalKeyKindRangeKeySet, pebble.InternalKeyKindRangeKeyUnset:
default:
return nil, errors.AssertionFailedf(
"can only ask for range keys on a range key entry, got %v", r.kind)
}
rangeKeys, err := r.rangeKeys()
if err != nil {
return nil, err
}
rkvs := make([]EngineRangeKeyValue, 0, len(rangeKeys.Keys))
for _, rk := range rangeKeys.Keys {
rkvs = append(rkvs, EngineRangeKeyValue{Version: rk.Suffix, Value: rk.Value})
}
return rkvs, nil
}
// rangeKeys decodes and returns the current Pebble range key.
func (r *BatchReader) rangeKeys() (rangekey.Span, error) {
return rangekey.Decode(pebble.MakeInternalKey(r.key, 0 /* seqNum */, r.kind), r.value, nil)
}
// Next advances to the next entry in the batch, returning false when the batch
// is empty.
func (r *BatchReader) Next() bool {
var ok bool
r.kind, r.key, r.value, ok, r.err = r.reader.Next()
return ok
}
// BatchCount provides an efficient way to get the count of mutations in a batch
// representation.
func BatchCount(repr []byte) (int, error) {
h, ok := batchrepr.ReadHeader(repr)
if !ok {
return 0, errors.Errorf("invalid batch: batch repr too small: %d bytes", len(repr))
}
return int(h.Count), nil
}
// Copyright 2022 The Cockroach Authors.
//
// Use of this software is governed by the CockroachDB Software License
// included in the /LICENSE file.
package storage
import (
"bytes"
"context"
"math"
"github.com/cockroachdb/cockroach/pkg/col/coldata"
"github.com/cockroachdb/cockroach/pkg/roachpb"
"github.com/cockroachdb/cockroach/pkg/settings/cluster"
"github.com/cockroachdb/cockroach/pkg/sql/catalog/fetchpb"
"github.com/cockroachdb/cockroach/pkg/storage/enginepb"
"github.com/cockroachdb/cockroach/pkg/util/buildutil"
"github.com/cockroachdb/cockroach/pkg/util/grpcutil"
"github.com/cockroachdb/cockroach/pkg/util/hlc"
"github.com/cockroachdb/cockroach/pkg/util/mon"
"github.com/cockroachdb/errors"
)
// This file defines several interfaces as well as introduces a couple of
// components that power the direct columnar scans. The main idea of this
// feature is to use the injected decoding logic from SQL in order to process
// each KV and keep only the needed parts (i.e. necessary SQL columns). Those
// needed parts are then propagated back to the KV client as coldata.Batch'es
// (serialized in the Apache Arrow format).
//
// Here is an example outline of all components involved:
//
// ┌────────────────────────────────────────────────┐
// │ SQL │
// │________________________________________________│
// │ colfetcher.ColBatchDirectScan │
// │ │ │
// │ ▼ │
// │ row.txnKVFetcher │
// │ (behind the row.KVBatchFetcher interface) │
// └────────────────────────────────────────────────┘
// │
// ▼
// ┌────────────────────────────────────────────────┐
// │ KV Client │
// └────────────────────────────────────────────────┘
// │
// ▼
// ┌────────────────────────────────────────────────┐
// │ KV Server │
// │________________________________________________│
// │ colfetcher.cFetcherWrapper │
// │ (behind the storage.CFetcherWrapper interface) │
// │ │ │
// │ ▼ │
// │ colfetcher.cFetcher │
// │ │ │
// │ ▼ │
// │ storage.mvccScanFetchAdapter ────────┐│
// │ (behind the storage.NextKVer interface) ││
// │ │ ││
// │ ▼ ││
// │ storage.pebbleMVCCScanner ││
// │ (which put's KVs into storage.singleResults) <┘│
// └────────────────────────────────────────────────┘
//
// On the KV client side, row.txnKVFetcher issues Scans and ReverseScans with
// the COL_BATCH_RESPONSE format and returns the response (which contains the
// columnar data) to the colfetcher.ColBatchDirectScan.
//
// On the KV server side, we create a storage.CFetcherWrapper that asks the
// colfetcher.cFetcher for the next coldata.Batch. The cFetcher, in turn,
// fetches the next KV, decodes it, and keeps only values for the needed SQL
// columns, discarding the rest of the KV. The KV is emitted by the
// mvccScanFetchAdapter which - via the singleResults struct - exposes access to
// the current KV that the pebbleMVCCScanner is pointing at.
//
// Note that there is an additional "implicit synchronization" between
// components that is not shown on this diagram. In particular,
// storage.singleResults.maybeTrimPartialLastRow must be in sync with the
// colfetcher.cFetcher which is achieved by
// - the cFetcher exposing access to the first key of the last incomplete SQL
// row via the FirstKeyOfRowGetter,
// - the singleResults using that key as the resume key for the response,
// - and the cFetcher removing that last partial SQL row when NextKV() returns
// partialRow=true.
// This "upstream" link (although breaking the layering a bit) allows us to
// avoid a performance penalty for handling the case with multiple column
// families. (This case is handled by the storage.pebbleResults via tracking
// offsets into the pebbleResults.repr.)
//
// This code structure deserves some elaboration. First, there is a mismatch
// between the "push" mode in which the pebbleMVCCScanner operates and the
// "pull" mode that the NextKVer exposes. The adaptation between the two modes
// is achieved via the mvccScanFetchAdapter grabbing (when the control
// returns to it) the current unstable KV pair from the singleResults struct
// which serves as a one KV pair buffer that the pebbleMVCCScanner `put`s into.
// Second, in order to be able to use the unstable KV pair without performing a
// copy, the pebbleMVCCScanner stops at the current KV pair and returns the
// control flow (which is exactly what pebbleMVCCScanner.getOne does) back to
// the mvccScanFetchAdapter, with the adapter advancing the scanner only when
// the next KV pair is needed.
// FirstKeyOfRowGetter returns the first key included into the last incomplete
// SQL row by the user of NextKVer. If the last row is complete, then nil is
// returned.
type FirstKeyOfRowGetter func() roachpb.Key
// NextKVer can fetch a new KV from somewhere. If MVCCDecodingStrategy is set
// to MVCCDecodingRequired, the returned KV will include a timestamp.
type NextKVer interface {
// Init initializes the NextKVer. It returns a boolean indicating whether
// the KVs returned by NextKV are stable (i.e. whether they will not be
// invalidated by calling NextKV again).
Init(getter FirstKeyOfRowGetter) (stableKVs bool)
// NextKV returns the next kv from this NextKVer.
// - ok=false indicates that there are no more kvs to fetch,
// - partialRow indicates whether the fetch stopped in the middle of a SQL
// row (in this case ok will be set to false),
// - the kv that was fetched,
// - any errors that may have occurred.
//
// When (ok=false,partialRow=true) is returned, the caller is expected to
// discard all KVs that were part of the last SQL row that was incomplete.
// The scan will be resumed from the key provided by the FirstKeyOfRowGetter
// (provided in Init by the caller) obtained during this NextKV call.
NextKV(context.Context, MVCCDecodingStrategy) (ok bool, partialRow bool, kv roachpb.KeyValue, err error)
}
// CFetcherWrapper is a wrapper around a colfetcher.cFetcher that populates only
// the needed (according to the fetchpb.IndexFetchSpec) vectors which are
// returned as coldata.Batch'es (either serialized or as is).
//
// Currently, non-enum user-defined types are unsupported when they are included
// as "needed" in the fetchpb.IndexFetchSpec (#92954).
type CFetcherWrapper interface {
// NextBatch gives back the next column-oriented batch, possibly serialized
// in Arrow batch format. All calls to NextBatch will use the same format
// (i.e. either all batches are serialized or none are).
//
// Regardless of the return format, subsequent calls to NextBatch do **not**
// invalidate the result of the previous calls. Additionally, the memory
// accounting for all returned batches throughout the lifetime of the
// CFetcherWrapper is done against the memory account provided in
// GetCFetcherWrapper().
NextBatch(ctx context.Context) ([]byte, coldata.Batch, error)
// Close releases the resources held by this CFetcherWrapper. It *must* be
// called after use of the wrapper.
Close(ctx context.Context)
}
// GetCFetcherWrapper returns a CFetcherWrapper. It's injected from
// pkg/sql/colfetcher to avoid circular dependencies since storage can't depend
// on higher levels of the system.
var GetCFetcherWrapper func(
ctx context.Context,
st *cluster.Settings,
acc *mon.BoundAccount,
indexFetchSpec *fetchpb.IndexFetchSpec,
nextKVer NextKVer,
startKey roachpb.Key,
mustSerialize bool,
) (CFetcherWrapper, error)
// onNextKVFn represents the transition that the mvccScanFetchAdapter needs to
// perform on the following NextKV() call.
type onNextKVFn int
const (
_ onNextKVFn = iota
// onNextKVSeek is the initial state of the mvccScanFetchAdapter where it
// must seek to the start of the scan. The state machine will then
// transition to the onNextKVAdvance state.
onNextKVSeek
// onNextKVAdvance is the main state of the mvccScanFetchAdapter where it
// advances the scanner to the next KV (which is then returned on the NextKV
// call). Once there are no more KVs to scan (either because the scan was
// exhausted or some kind of limit was reached), the state machine will
// transition to the onNextKVDone state.
onNextKVAdvance
// onNextKVDone is the final state of the mvccScanFetchAdapter which
// indicates that the current scan is complete.
onNextKVDone
)
// mvccScanFetchAdapter is a NextKVer that is powered directly by the
// pebbleMVCCScanner. Each time its NextKV is called, it advances the pebble
// iterator (possibly several times if some KVs need to be skipped) and returns
// a single KV. Note that the returned KV is only valid until the next call to
// NextKV.
type mvccScanFetchAdapter struct {
scanner *pebbleMVCCScanner
machine onNextKVFn
results singleResults
}
var _ NextKVer = &mvccScanFetchAdapter{}
// Init implements the NextKVer interface.
func (f *mvccScanFetchAdapter) Init(firstKeyGetter FirstKeyOfRowGetter) (stableKVs bool) {
f.results.firstKeyGetter = firstKeyGetter
// The returned kv is never stable because it'll be invalidated by the
// pebbleMVCCScanner on each NextKV() call.
return false
}
// NextKV implements the NextKVer interface.
func (f *mvccScanFetchAdapter) NextKV(
ctx context.Context, mvccDecodingStrategy MVCCDecodingStrategy,
) (ok bool, partialRow bool, kv roachpb.KeyValue, err error) {
// Loop until we add a KV into the results (KVs might be skipped due to
// having been deleted).
// TODO(yuzefovich, 23.1): check whether having this loop has noticeable
// impact on the performance.
for added := false; !added; {
// Perform the action according to the current state.
switch f.machine {
case onNextKVSeek:
if !f.scanner.seekToStartOfScan() {
return false, false, roachpb.KeyValue{}, f.scanner.err
}
f.machine = onNextKVAdvance
case onNextKVAdvance:
if !f.scanner.advance() {
// No more keys in the scan.
return false, false, roachpb.KeyValue{}, nil
}
case onNextKVDone:
// No more keys in the scan.
return false, f.results.partialRowTrimmed, roachpb.KeyValue{}, nil
}
ok, added = f.scanner.getOne(ctx)
if !ok {
// ok=false indicates that the iteration must stop, so we're done
// after we process the current KV (if it was added).
f.machine = onNextKVDone
}
}
// We have a KV to return. Decode it according to mvccDecodingStrategy.
kv = f.results.getLastKV()
mvccKey := kv.Key
if buildutil.CrdbTestBuild {
if len(mvccKey) == 0 || len(kv.Value.RawBytes) == 0 {
return false, false, kv, errors.AssertionFailedf("unexpectedly received an empty lastKV")
}
}
switch mvccDecodingStrategy {
case MVCCDecodingRequired:
kv.Key, kv.Value.Timestamp, err = enginepb.DecodeKey(mvccKey)
if err != nil {
return false, false, kv, errors.AssertionFailedf("invalid encoded mvcc key: %x", mvccKey)
}
case MVCCDecodingNotRequired:
kv.Key, _, ok = enginepb.SplitMVCCKey(mvccKey)
if !ok {
return false, false, kv, errors.AssertionFailedf("invalid encoded mvcc key: %x", mvccKey)
}
}
return true, false, kv, nil
}
// singleResults is an implementation of the results interface that is able to
// hold only a single KV at a time - all KVs are "accumulated" in the
// colfetcher.cFetcher.
//
// Here is how all things fit together:
// - the colfetcher.cFetcher calls NextKV on the mvccScanFetchAdapter;
// - the mvccScanFetchAdapter advances the pebbleMVCCScanner to the next key;
// - the mvccScanFetchAdapter asks the scanner to `getOne` which `put`s a new
// KV into the `singleResults`. Importantly, the pebbleMVCCScanner is not
// eagerly advancing further which allows us to just use the unstable
// key-value from the pebbleMVCCScanner;
// - the mvccScanFetchAdapter peeks into the `singleResults` struct to extract
// the new KV, possibly decodes the timestamp, and returns it to the
// colfetcher.cFetcher for processing;
// - the colfetcher.cFetcher decodes the KV, and goes back to the first step.
type singleResults struct {
maxKeysPerRow uint32
maxFamilyID uint32
onClear func()
count, bytes int64
mvccKey []byte
value []byte
firstKeyGetter FirstKeyOfRowGetter
// firstRowKeyPrefix is a deep copy of the "row prefix" of the first SQL row
// seen by the singleResults (only set when the table has multiple column
// families).
firstRowKeyPrefix []byte
partialRowTrimmed bool
}
var _ results = &singleResults{}
// clear implements the results interface.
func (s *singleResults) clear() {
s.onClear()
*s = singleResults{}
}
func singleResultsKVSizeOf(lenKey, lenValue int) int64 {
// TODO(yuzefovich, 23.1): come up with a formula that better represents the
// footprint of the serialized batches.
return int64(lenKey + lenValue)
}
// sizeInfo implements the results interface.
func (s *singleResults) sizeInfo(lenKey, lenValue int) (numKeys, numBytes, numBytesInc int64) {
numKeys = s.count
// TODO(yuzefovich, 23.1): consider using the footprint of coldata.Batches
// so far (or of serialized representations) here.
numBytes = s.bytes
numBytesInc = singleResultsKVSizeOf(lenKey, lenValue)
return numKeys, numBytes, numBytesInc
}
// put implements the results interface.
func (s *singleResults) put(
ctx context.Context, mvccKey []byte, value []byte, memAccount *mon.BoundAccount, _ int,
) error {
bytesInc := singleResultsKVSizeOf(len(mvccKey), len(value))
if err := memAccount.Grow(ctx, bytesInc); err != nil {
return err
}
s.count++
s.bytes += bytesInc
s.mvccKey = mvccKey
s.value = value
if s.count == 1 && s.maxKeysPerRow > 1 {
// If this is the first key, and we have multiple column families, then
// we store the deep-copied row prefix of this key. This is needed to
// implement continuesFirstRow.
key, _, ok := enginepb.SplitMVCCKey(mvccKey)
if !ok {
return errors.AssertionFailedf("invalid encoded mvcc key: %x", mvccKey)
}
firstRowKeyPrefix := getRowPrefix(key)
s.firstRowKeyPrefix = make([]byte, len(firstRowKeyPrefix))
copy(s.firstRowKeyPrefix, firstRowKeyPrefix)
}
return nil
}
// continuesFirstRow implements the results interface.
func (s *singleResults) continuesFirstRow(key roachpb.Key) bool {
rowPrefix := getRowPrefix(key)
if rowPrefix == nil {
return false
}
return bytes.Equal(rowPrefix, s.firstRowKeyPrefix)
}
// maybeTrimPartialLastRow implements the results interface.
func (s *singleResults) maybeTrimPartialLastRow(key roachpb.Key) (roachpb.Key, error) {
firstKeyOfRow := s.firstKeyGetter()
// getRowPrefix handles the case of empty key, so we don't need to check
// that explicitly upfront.
if !bytes.Equal(getRowPrefix(firstKeyOfRow), getRowPrefix(key)) {
// The given key is the first KV of the next row, so we will resume the
// scan from this key.
return key, nil
}
// The given key is part of the current last row, and it will be removed by
// the cFetcher (since NextKV() will return partialRow=true before the row
// can be completed), thus, we'll resume the scan from the first key in the
// last row.
s.partialRowTrimmed = true
return firstKeyOfRow, nil
}
// lastRowHasFinalColumnFamily implements the results interface.
func (s *singleResults) lastRowHasFinalColumnFamily(reverse bool) bool {
key, _, ok := enginepb.SplitMVCCKey(s.mvccKey)
if !ok {
return false
}
return keyHasFinalColumnFamily(key, s.maxFamilyID, reverse)
}
func (s *singleResults) getLastKV() roachpb.KeyValue {
return roachpb.KeyValue{
Key: s.mvccKey,
Value: roachpb.Value{RawBytes: s.value},
}
}
// MVCCScanToCols is like MVCCScan, but it returns KVData in a serialized
// columnar batch suitable for reading by colserde.RecordBatchDeserializer.
func MVCCScanToCols(
ctx context.Context,
reader Reader,
indexFetchSpec *fetchpb.IndexFetchSpec,
key, endKey roachpb.Key,
timestamp hlc.Timestamp,
opts MVCCScanOptions,
st *cluster.Settings,
) (MVCCScanResult, error) {
iter, err := newMVCCIterator(
ctx, reader, timestamp, !opts.Tombstones, opts.DontInterleaveIntents, IterOptions{
KeyTypes: IterKeyTypePointsAndRanges,
LowerBound: key,
UpperBound: endKey,
ReadCategory: opts.ReadCategory,
},
)
if err != nil {
return MVCCScanResult{}, err
}
defer iter.Close()
return mvccScanToCols(ctx, iter, indexFetchSpec, key, endKey, timestamp, opts, st)
}
func mvccScanToCols(
ctx context.Context,
iter MVCCIterator,
indexFetchSpec *fetchpb.IndexFetchSpec,
key, endKey roachpb.Key,
timestamp hlc.Timestamp,
opts MVCCScanOptions,
st *cluster.Settings,
) (MVCCScanResult, error) {
mvccScanner := pebbleMVCCScannerPool.Get().(*pebbleMVCCScanner)
adapter := mvccScanFetchAdapter{machine: onNextKVSeek}
adapter.results.maxKeysPerRow = indexFetchSpec.MaxKeysPerRow
adapter.results.maxFamilyID = uint32(indexFetchSpec.MaxFamilyID)
ok, res, err := mvccScanInit(mvccScanner, iter, key, endKey, timestamp, opts, &adapter.results)
if !ok {
return res, err
}
defer mvccScanner.release()
adapter.scanner = mvccScanner
// Try to use the same root monitor (from the store) if the account is
// provided.
var monitor *mon.BytesMonitor
if opts.MemoryAccount != nil {
monitor = opts.MemoryAccount.Monitor()
} else {
// If we don't have the monitor, then we create a "fake" one that is not
// connected to the memory accounting system.
monitor = mon.NewMonitor(mon.Options{
Name: mon.MakeMonitorName("mvcc-scan-to-cols"),
Settings: st,
})
monitor.Start(ctx, nil /* pool */, mon.NewStandaloneBudget(math.MaxInt64))
defer monitor.Stop(ctx)
}
acc := monitor.MakeBoundAccount()
defer acc.Close(ctx)
_, isLocal := grpcutil.IsLocalRequestContext(ctx)
// Note that the CFetcherWrapper might still serialize the batches even for
// local requests.
mustSerialize := !isLocal
wrapper, err := GetCFetcherWrapper(
ctx,
st,
&acc,
indexFetchSpec,
&adapter,
key,
mustSerialize,
)
if err != nil {
return MVCCScanResult{}, err
}
defer wrapper.Close(ctx)
adapter.results.onClear = func() {
// Discard the accumulated batches on results.clear() call - the scan
// will result in an error.
res = MVCCScanResult{}
}
for {
serializedBatch, colBatch, err := wrapper.NextBatch(ctx)
if err != nil {
return res, err
}
if serializedBatch == nil && colBatch == nil {
break
}
if len(serializedBatch) > 0 {
res.KVData = append(res.KVData, serializedBatch)
} else {
res.ColBatches = append(res.ColBatches, colBatch)
}
}
if buildutil.CrdbTestBuild {
if mustSerialize && len(res.ColBatches) > 0 {
return MVCCScanResult{}, errors.AssertionFailedf(
"in-memory batches returned by the CFetcherWrapper when serialization is required",
)
}
if len(res.KVData) > 0 && len(res.ColBatches) > 0 {
return MVCCScanResult{}, errors.AssertionFailedf(
"both serialized and in-memory batches returned by the CFetcherWrapper",
)
}
}
res.ResumeSpan, res.ResumeReason, res.ResumeNextBytes, err = mvccScanner.afterScan()
if err != nil {
return MVCCScanResult{}, err
}
if err = finalizeScanResult(mvccScanner, &res, opts); err != nil {
return MVCCScanResult{}, err
}
return res, nil
}
// Copyright 2017 The Cockroach Authors.
//
// Use of this software is governed by the CockroachDB Software License
// included in the /LICENSE file.
package storage
import (
"context"
"slices"
"sync/atomic"
"github.com/cockroachdb/cockroach/pkg/kv/kvserver/diskmap"
"github.com/cockroachdb/cockroach/pkg/roachpb"
"github.com/cockroachdb/cockroach/pkg/util/encoding"
"github.com/cockroachdb/cockroach/pkg/util/log"
"github.com/cockroachdb/errors"
"github.com/cockroachdb/pebble"
)
// defaultBatchCapacityBytes is the default capacity for a
// SortedDiskMapBatchWriter.
const defaultBatchCapacityBytes = 4096
// tempStorageID is the temp ID generator for a node. It generates unique
// prefixes for NewPebbleMap. It is a global because NewPebbleMap needs to
// prefix its writes uniquely, and using a global prevents users from having to
// specify the prefix themselves and correctly guarantee that it is unique.
var tempStorageID uint64
func generateTempStorageID() uint64 {
return atomic.AddUint64(&tempStorageID, 1)
}
// pebbleMapBatchWriter batches writes to a pebbleMap.
type pebbleMapBatchWriter struct {
// capacity is the number of bytes to write before a Flush() is triggered.
capacity int
// makeKey is a function that transforms a key into a byte slice with a prefix
// to be written to the underlying store.
makeKey func(k []byte) []byte
batch *pebble.Batch
// onFlush will be called after every batch commit.
onFlush func()
numPutsSinceFlush int
}
// pebbleMapIterator iterates over the keys of a pebbleMap in sorted order.
type pebbleMapIterator struct {
allowDuplicates bool
iter *pebble.Iterator
// prefixLen is the length of the prefix of keys that this iterator iterates
// over.
prefixLen int
// makeKeyScratch is a scratch space reused when transforming a key into a
// byte slice with a prefix used to SeekGE() the iterator. First prefixLen
// bytes are always the prefix of all keys touched by this iterator.
makeKeyScratch []byte
}
// pebbleMap is a SortedDiskMap that uses pebble as its underlying storage
// engine.
type pebbleMap struct {
// prefix always stores the unique prefix shared by all keys in the map.
prefix []byte
store *pebble.DB
allowDuplicates bool
keyID int64
}
var _ diskmap.SortedDiskMapBatchWriter = &pebbleMapBatchWriter{}
var _ diskmap.SortedDiskMapIterator = &pebbleMapIterator{}
var _ diskmap.SortedDiskMap = &pebbleMap{}
// newPebbleMap creates a new pebbleMap with the passed-in pebble.DB as the
// underlying store. The pebbleMap instance will have a keyspace prefixed by a
// unique prefix. The allowDuplicates parameter controls whether Puts with
// identical keys will write multiple entries or overwrite previous entries.
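// An illustrative sketch (assuming db is an open *pebble.DB):
//
// m := newPebbleMap(db, true /* allowDuplicates */)
// w := m.NewBatchWriter()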
func newPebbleMap(e *pebble.DB, allowDuplicates bool) *pebbleMap {
prefix := generateTempStorageID()
return &pebbleMap{
prefix: encoding.EncodeUvarintAscending([]byte(nil), prefix),
store: e,
allowDuplicates: allowDuplicates,
}
}
// makeKey appends k to the pebbleMap's prefix to keep the key local to this
// instance and returns a byte slice containing the user-provided key and the
// prefix. Pebble's operations can take this byte slice as a key. This key is
// only valid until the next call to makeKey.
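// For example (illustrative), makeKey([]byte("a")) returns prefix+"a"; the
// returned slice aliases the prefix's spare capacity and is overwritten by
// the next makeKey call.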
func (r *pebbleMap) makeKey(k []byte) []byte {
prefixLen := len(r.prefix)
r.prefix = append(r.prefix, k...)
key := r.prefix
r.prefix = r.prefix[:prefixLen]
return key
}
// makeKeyWithSequence makes a key appropriate for a Put operation. It is like
// makeKey except it respects allowDuplicates, by appending a sequence number to
// the user-provided key.
func (r *pebbleMap) makeKeyWithSequence(k []byte) []byte {
byteKey := r.makeKey(k)
if r.allowDuplicates {
r.keyID++
byteKey = encoding.EncodeUint64Ascending(byteKey, uint64(r.keyID))
}
return byteKey
}
// NewIterator implements the SortedDiskMap interface.
func (r *pebbleMap) NewIterator() diskmap.SortedDiskMapIterator {
iter, err := r.store.NewIter(&pebble.IterOptions{
UpperBound: roachpb.Key(r.prefix).PrefixEnd(),
})
if err != nil {
// TODO(bilal): Update all diskMap interfaces to allow returning errors here.
panic(err)
}
return &pebbleMapIterator{
allowDuplicates: r.allowDuplicates,
iter: iter,
prefixLen: len(r.prefix),
makeKeyScratch: append([]byte{}, r.prefix...),
}
}
// NewBatchWriter implements the SortedDiskMap interface.
func (r *pebbleMap) NewBatchWriter() diskmap.SortedDiskMapBatchWriter {
return r.NewBatchWriterCapacity(defaultBatchCapacityBytes)
}
// NewBatchWriterCapacity implements the SortedDiskMap interface.
func (r *pebbleMap) NewBatchWriterCapacity(capacityBytes int) diskmap.SortedDiskMapBatchWriter {
makeKey := r.makeKey
if r.allowDuplicates {
makeKey = r.makeKeyWithSequence
}
b := &pebbleMapBatchWriter{
capacity: capacityBytes,
makeKey: makeKey,
batch: r.store.NewBatch(),
}
b.onFlush = func() {
// If we happened to have Put very large keys, we want to lose
// references to them.
r.gcPrefixSlice()
b.numPutsSinceFlush = 0
b.batch = r.store.NewBatch()
}
return b
}
const maxPrefixCapReuse = 1 << 20 /* 1 MiB */
func (r *pebbleMap) gcPrefixSlice() {
if cap(r.prefix) > maxPrefixCapReuse {
r.prefix = slices.Clone(r.prefix)
}
}
// Clear implements the SortedDiskMap interface.
func (r *pebbleMap) Clear() error {
if err := r.store.DeleteRange(
r.prefix,
roachpb.Key(r.prefix).PrefixEnd(),
pebble.NoSync,
); err != nil {
return errors.Wrapf(err, "unable to clear range with prefix %v", r.prefix)
}
// NB: we manually flush after performing the clear range to ensure that the
// range tombstone is pushed to disk which will kick off compactions that
// will eventually free up the deleted space.
_, err := r.store.AsyncFlush()
return err
}
// Close implements the SortedDiskMap interface.
func (r *pebbleMap) Close(ctx context.Context) {
if err := r.Clear(); err != nil {
log.Errorf(ctx, "%v", err)
}
}
// makeKey is a function that transforms a key into a byte slice with a prefix
// used to SeekGE() the underlying iterator. This key is only valid until the
// next call to makeKey and **cannot** be mutated.
func (i *pebbleMapIterator) makeKey(k []byte) []byte {
i.makeKeyScratch = append(i.makeKeyScratch[:i.prefixLen], k...)
return i.makeKeyScratch
}
// SeekGE implements the SortedDiskMapIterator interface.
func (i *pebbleMapIterator) SeekGE(k []byte) {
i.iter.SeekGE(i.makeKey(k))
}
// Rewind implements the SortedDiskMapIterator interface.
func (i *pebbleMapIterator) Rewind() {
i.iter.SeekGE(i.makeKey(nil))
}
// Valid implements the SortedDiskMapIterator interface.
func (i *pebbleMapIterator) Valid() (bool, error) {
return i.iter.Valid(), nil
}
// Next implements the SortedDiskMapIterator interface.
func (i *pebbleMapIterator) Next() {
i.iter.Next()
}
// UnsafeKey implements the SortedDiskMapIterator interface.
func (i *pebbleMapIterator) UnsafeKey() []byte {
unsafeKey := i.iter.Key()
end := len(unsafeKey)
if i.allowDuplicates {
// There are 8 bytes of sequence number at the end of the key, remove them.
end -= 8
}
return unsafeKey[i.prefixLen:end]
}
// UnsafeValue implements the SortedDiskMapIterator interface.
func (i *pebbleMapIterator) UnsafeValue() []byte {
// TODO(sumeer): switch to using ValueAndErr. Since errors only happen for
// non-in-place values, and temp engines only have in-place values, this
// change is not critical.
return i.iter.Value()
}
// Close implements the SortedDiskMapIterator interface.
func (i *pebbleMapIterator) Close() {
_ = i.iter.Close()
}
// Put implements the SortedDiskMapBatchWriter interface.
func (b *pebbleMapBatchWriter) Put(k []byte, v []byte) error {
key := b.makeKey(k)
if err := b.batch.Set(key, v, nil); err != nil {
return err
}
b.numPutsSinceFlush++
if len(b.batch.Repr()) >= b.capacity {
return b.Flush()
}
return nil
}
// Flush implements the SortedDiskMapBatchWriter interface.
func (b *pebbleMapBatchWriter) Flush() error {
if err := b.batch.Commit(pebble.NoSync); err != nil {
return err
}
b.onFlush()
return nil
}
// NumPutsSinceFlush implements the SortedDiskMapBatchWriter interface.
func (b *pebbleMapBatchWriter) NumPutsSinceFlush() int {
return b.numPutsSinceFlush
}
// Close implements the SortedDiskMapBatchWriter interface.
func (b *pebbleMapBatchWriter) Close(ctx context.Context) error {
err := b.Flush()
if err != nil {
return err
}
return b.batch.Close()
}
// Copyright 2014 The Cockroach Authors.
//
// Use of this software is governed by the CockroachDB Software License
// included in the /LICENSE file.
package storage
import (
"bytes"
"context"
"time"
"github.com/cockroachdb/cockroach/pkg/keys"
"github.com/cockroachdb/cockroach/pkg/kv/kvserver/concurrency/lock"
"github.com/cockroachdb/cockroach/pkg/roachpb"
"github.com/cockroachdb/cockroach/pkg/settings"
"github.com/cockroachdb/cockroach/pkg/settings/cluster"
"github.com/cockroachdb/cockroach/pkg/storage/enginepb"
"github.com/cockroachdb/cockroach/pkg/storage/fs"
"github.com/cockroachdb/cockroach/pkg/storage/pebbleiter"
"github.com/cockroachdb/cockroach/pkg/util/hlc"
"github.com/cockroachdb/cockroach/pkg/util/iterutil"
"github.com/cockroachdb/cockroach/pkg/util/log"
"github.com/cockroachdb/cockroach/pkg/util/log/eventpb"
"github.com/cockroachdb/cockroach/pkg/util/protoutil"
"github.com/cockroachdb/cockroach/pkg/util/uuid"
"github.com/cockroachdb/errors"
"github.com/cockroachdb/pebble"
"github.com/cockroachdb/pebble/rangekey"
"github.com/cockroachdb/pebble/vfs"
"github.com/cockroachdb/redact"
prometheusgo "github.com/prometheus/client_model/go"
)
// SimpleMVCCIterator is an interface for iterating over key/value pairs in an
// engine. SimpleMVCCIterator implementations are thread safe unless otherwise
// noted. SimpleMVCCIterator is a subset of the functionality offered by
// MVCCIterator.
//
// API invariants are asserted via assertSimpleMVCCIteratorInvariants().
//
// The iterator exposes both point keys and range keys. Range keys are only
// emitted when enabled via IterOptions.KeyTypes. Currently, all range keys are
// MVCC range tombstones, and this is enforced during writes.
//
// Range keys and point keys exist separately in Pebble. A specific key position
// can have both a point key and multiple range keys overlapping it. Their
// properties are accessed via:
//
// HasPointAndRange(): Key types present at the current position.
// UnsafeKey(): Current position (point key if any).
// UnsafeValue(): Current point key value (if any).
// RangeBounds(): Start,end bounds of range keys at current position.
// RangeKeys(): All range keys/values overlapping current position.
//
// Consider the following point keys and range keys:
//
// 4: a4 b4
// 3: [-------)
// 2: [-------)
// 1: b1 c1
// a b c
//
// Range keys cover a span between two roachpb.Key bounds (start inclusive, end
// exclusive) and contain timestamp/value pairs. They overlap *all* point key
// versions within their key bounds regardless of timestamp. For example, when
// the iterator is positioned on b@4, it will also expose [a-c)@3 and [a-c)@2.
//
// During iteration with IterKeyTypePointsAndRanges, range keys are emitted at
// their start key and at every overlapping point key. For example, iterating
// across the above span would emit this sequence:
//
// UnsafeKey HasPointAndRange UnsafeValue RangeKeys
// a false,true - [a-c)@3 [a-c)@2
// a@4 true,true a4 [a-c)@3 [a-c)@2
// b@4 true,true b4 [a-c)@3 [a-c)@2
// b@1 true,true b1 [a-c)@3 [a-c)@2
// c@1 true,false c1 -
//
// MVCCIterator reverse iteration yields the above sequence in reverse.
// Notably, bare range keys are still emitted at their start key (not end key),
// so they will be emitted last in this example.
//
// When using SeekGE within range key bounds, the iterator may land on the bare
// range key first, unless seeking exactly to an existing point key. E.g.:
//
// SeekGE UnsafeKey HasPointAndRange UnsafeValue RangeKeys
// b b false,true - [a-c)@3 [a-c)@2
// b@5 b@5 false,true - [a-c)@3 [a-c)@2
// b@4 b@4 true,true b4 [a-c)@3 [a-c)@2
// b@3 b@3 false,true - [a-c)@3 [a-c)@2
//
// Note that intents (with timestamp 0) encode to a bare roachpb.Key, so they
// will be colocated with a range key start bound. For example, if there was an
// intent on a in the above example, then both SeekGE(a) and forward iteration
// would land on a@0 and [a-c)@3,[a-c)@2 simultaneously, instead of the bare
// range keys first.
//
// Range keys do not have a stable, discrete identity, and should be
// considered a continuum: they may be merged or fragmented by other range key
// writes, split and merged along with CRDB ranges, partially removed by GC,
// and truncated by iterator bounds.
//
// Range keys are fragmented by Pebble such that all overlapping range keys
// between two keys form a stack of range key fragments at different timestamps.
// For example, writing [a-e)@1 and [c-g)@2 will yield this fragment structure:
//
// 2: |---|---|
// 1: |---|---|
// a c e g
//
// Fragmentation makes all range key properties local, which avoids incurring
// unnecessary access costs across SSTs and CRDB ranges. It is deterministic
// on the current range key state, and does not depend on write history.
// Stacking allows easy access to all range keys overlapping a point key.
//
// For more information on MVCC range keys, see this tech note:
// https://github.com/cockroachdb/cockroach/blob/master/docs/tech-notes/mvcc-range-tombstones.md
type SimpleMVCCIterator interface {
// Close frees up resources held by the iterator.
Close()
// SeekGE advances the iterator to the first key in the engine which is >= the
// provided key. This may be in the middle of a bare range key straddling the
// seek key.
SeekGE(key MVCCKey)
// Valid must be called after any call to Seek(), Next(), Prev(), or
// similar methods. It returns (true, nil) if the iterator points to
// a valid key (it is undefined to call Key(), Value(), or similar
// methods unless Valid() has returned (true, nil)). It returns
// (false, nil) if the iterator has moved past the end of the valid
// range, or (false, err) if an error has occurred. Valid() will
// never return true with a non-nil error.
Valid() (bool, error)
// Next advances the iterator to the next key in the iteration. After this
// call, Valid() will be true if the iterator was not positioned at the last
// key.
Next()
// NextKey advances the iterator to the next MVCC key. This operation is
// distinct from Next which advances to the next version of the current key
// or the next key if the iterator is currently located at the last version
// for a key. NextKey must not be used to switch iteration direction from
// reverse iteration to forward iteration.
//
// If NextKey() lands on a bare range key, it is possible that there exists a
// versioned point key at the start key too. Calling NextKey() again would
// skip over this point key, since the start key was already emitted. If the
// caller wants to see it, it must call Next() to check for it. Note that
// this is not the case with intents: they don't have a timestamp, so the
// encoded key is identical to the range key's start bound, and they will
// be emitted together at that position.
NextKey()
// UnsafeKey returns the current key position. This may be a point key, or
// the current position inside a range key (typically the start key
// or the seek key when using SeekGE within its bounds).
//
// The memory is invalidated on the next call to {Next,NextKey,Prev,SeekGE,
// SeekLT,Close}. Use Key() if this is undesirable.
UnsafeKey() MVCCKey
// UnsafeValue returns the current point key value as a byte slice.
// This must only be called when it is known that the iterator is positioned
// at a point value, i.e. HasPointAndRange has returned (true, *). If
// possible, use MVCCValueLenAndIsTombstone() instead.
//
// The memory is invalidated on the next call to {Next,NextKey,Prev,SeekGE,SeekLT,Close}.
// Use Value() if that is undesirable.
UnsafeValue() ([]byte, error)
// MVCCValueLenAndIsTombstone should be called only for MVCC (i.e.,
// UnsafeKey().IsValue()) point values, when the actual point value is not
// needed, for example when updating stats and making GC decisions, and it
// is sufficient for the caller to know the length (len(UnsafeValue())), and
// whether the underlying MVCCValue is a tombstone
// (MVCCValue.IsTombstone()). This is an optimization that can allow the
// underlying storage layer to avoid retrieving the value.
// REQUIRES: HasPointAndRange() has returned (true, *).
MVCCValueLenAndIsTombstone() (int, bool, error)
// ValueLen can be called for MVCC or non-MVCC values, when only the value
// length is needed. This is an optimization that can allow the underlying
// storage layer to avoid retrieving the value.
// REQUIRES: HasPointAndRange() has returned (true, *).
ValueLen() int
// HasPointAndRange returns whether the current iterator position has a point
// key and/or a range key. Must check Valid() first. At least one of these
// will always be true for a valid iterator. For details on range keys, see
// comment on SimpleMVCCIterator.
HasPointAndRange() (bool, bool)
// RangeBounds returns the range bounds for the current range key, or an
// empty span if there are none. The returned keys are valid until the
// range key changes, see RangeKeyChanged().
RangeBounds() roachpb.Span
// RangeKeys returns a stack of all range keys (with different timestamps) at
// the current key position. When at a point key, it will return all range
// keys overlapping that point key. The stack is valid until the range key
// changes, see RangeKeyChanged().
//
// For details on range keys, see SimpleMVCCIterator comment, or tech note:
// https://github.com/cockroachdb/cockroach/blob/master/docs/tech-notes/mvcc-range-tombstones.md
RangeKeys() MVCCRangeKeyStack
// RangeKeyChanged returns true if the previous seek or step moved to a
// different range key (or none at all). Requires a valid iterator, but an
// exhausted iterator is considered to have had no range keys when calling
// this after repositioning.
RangeKeyChanged() bool
}
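// The following is an illustrative sketch (not part of the interface
// contract) of how a caller might drive a SimpleMVCCIterator opened with
// IterKeyTypePointsAndRanges, distinguishing point and range keys at each
// position. Iterator construction and key handling are assumed to happen
// elsewhere.
func exampleScanPointsAndRanges(iter SimpleMVCCIterator, start MVCCKey) error {
	for iter.SeekGE(start); ; iter.Next() {
		if ok, err := iter.Valid(); err != nil {
			return err
		} else if !ok {
			return nil
		}
		hasPoint, hasRange := iter.HasPointAndRange()
		if hasRange {
			// All range keys overlapping the current position, fragmented and
			// stacked by timestamp.
			_ = iter.RangeKeys()
		}
		if hasPoint {
			// Only valid to call when a point key is present.
			if _, err := iter.UnsafeValue(); err != nil {
				return err
			}
		}
	}
}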
// IteratorStats is returned from {MVCCIterator,EngineIterator}.Stats.
type IteratorStats struct {
// Iteration stats. We directly expose pebble.IteratorStats. Callers
// may want to aggregate and interpret these in the following manner:
// - Aggregate {Forward,Reverse}SeekCount, {Forward,Reverse}StepCount.
// - Interpret the four aggregated stats as follows:
// - {SeekCount,StepCount}[InterfaceCall]: We can refer to these simply as
// {SeekCount,StepCount} in logs/metrics/traces. These represent
// explicit calls by the implementation of MVCCIterator, in response to
// the caller of MVCCIterator. A high count relative to the unique MVCC
// keys returned suggests there are many versions for the same key.
// - {SeekCount,StepCount}[InternalIterCall]: We can refer to these simply
// as {InternalSeekCount,InternalStepCount}. If these are significantly
// larger than the ones in the preceding bullet, it suggests that there
// are lots of uncompacted deletes or stale Pebble-versions (not to be
// confused with MVCC versions) that need to be compacted away. This
// should be very rare, but has been observed.
Stats pebble.IteratorStats
}
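// A minimal sketch of the aggregation described above, combining forward and
// reverse counts and splitting them by interface vs. internal calls. The
// pebble.IteratorStats field and index names follow the preceding comment.
func exampleAggregateIteratorStats(
	s IteratorStats,
) (seeks, steps, internalSeeks, internalSteps int) {
	p := s.Stats
	seeks = p.ForwardSeekCount[pebble.InterfaceCall] + p.ReverseSeekCount[pebble.InterfaceCall]
	steps = p.ForwardStepCount[pebble.InterfaceCall] + p.ReverseStepCount[pebble.InterfaceCall]
	internalSeeks = p.ForwardSeekCount[pebble.InternalIterCall] + p.ReverseSeekCount[pebble.InternalIterCall]
	internalSteps = p.ForwardStepCount[pebble.InternalIterCall] + p.ReverseStepCount[pebble.InternalIterCall]
	return seeks, steps, internalSeeks, internalSteps
}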
// MVCCIterator is an interface for iterating over key/value pairs in an
// engine. It is used for iterating over the key space that can have multiple
// versions, and is often also used (due to historical reasons) for iterating
// over the key space that never has multiple versions (i.e.,
// MVCCKey.Timestamp.IsEmpty()).
//
// MVCCIterator implementations are thread safe unless otherwise noted. API
// invariants are asserted via assertMVCCIteratorInvariants().
//
// For details on range keys and iteration, see comment on SimpleMVCCIterator.
type MVCCIterator interface {
SimpleMVCCIterator
// SeekLT advances the iterator to the first key in the engine which is < the
// provided key. Unlike SeekGE, when calling SeekLT within range key bounds
// this will not land on the seek key, but rather on the closest point key
// overlapping the range key or the range key's start bound.
SeekLT(key MVCCKey)
// Prev moves the iterator backward to the previous key in the iteration.
// After this call, Valid() will be true if the iterator was not positioned at
// the first key.
Prev()
// UnsafeRawKey returns the current raw key which could be an encoded
// MVCCKey, or the more general EngineKey (for a lock table key).
// This is a low-level and dangerous method since it will expose the
// raw key of the lock table, i.e., the intentInterleavingIter will not
// hide the difference between interleaved and separated intents.
// Callers should be very careful when using this. This is currently
// only used by callers who are iterating and deleting all data in a
// range.
UnsafeRawKey() []byte
// UnsafeRawMVCCKey returns a serialized MVCCKey. The memory is invalidated
// on the next call to {Next,NextKey,Prev,SeekGE,SeekLT,Close}. If the
// iterator is currently positioned at a separated intent (when
// intentInterleavingIter is used), it makes that intent look like an
// interleaved intent key, i.e., an MVCCKey with an empty timestamp. This is
// currently used by callers who pass around key information as a []byte --
// this seems avoidable, and we should consider cleaning up the callers.
UnsafeRawMVCCKey() []byte
// Value is like UnsafeValue, but returns memory owned by the caller.
Value() ([]byte, error)
// ValueProto unmarshals the value the iterator is currently
// pointing to using a protobuf decoder.
ValueProto(msg protoutil.Message) error
// FindSplitKey finds a key from the given span such that the left side of the
// split is roughly targetSize bytes. It only considers MVCC point keys, not
// range keys. The returned key will never be chosen from the key ranges
// listed in keys.NoSplitSpans and will always sort equal to or after
// minSplitKey.
//
// DO NOT CALL directly (except in wrapper MVCCIterator implementations). Use the
// package-level MVCCFindSplitKey instead. For correct operation, the caller
// must set the upper bound on the iterator before calling this method.
FindSplitKey(start, end, minSplitKey roachpb.Key, targetSize int64) (MVCCKey, error)
// Stats returns statistics about the iterator.
Stats() IteratorStats
// IsPrefix returns true if the MVCCIterator is a prefix iterator, i.e.
// created with IterOptions.Prefix enabled.
IsPrefix() bool
// UnsafeLazyValue is only for use inside the storage package. It exposes
// the LazyValue at the current iterator position, and hence delays fetching
// the actual value. It is exposed for reverse scans that need to search for
// the most recent relevant version, and can't know whether the current
// value is that version, and need to step back to make that determination.
//
// REQUIRES: Valid() returns true.
UnsafeLazyValue() pebble.LazyValue
}
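// An illustrative sketch (not part of the contract) of reverse iteration with
// an MVCCIterator: position before end and walk backwards until the iterator
// is exhausted or its lower bound is reached.
func exampleReverseScan(iter MVCCIterator, end MVCCKey) error {
	for iter.SeekLT(end); ; iter.Prev() {
		if ok, err := iter.Valid(); err != nil {
			return err
		} else if !ok {
			return nil
		}
		// UnsafeKey's memory is only valid until the next repositioning call;
		// copy it (or use Value()) if it needs to be retained.
		_ = iter.UnsafeKey()
	}
}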
// EngineIterator is an iterator over key-value pairs where the key is
// an EngineKey.
type EngineIterator interface {
// Close frees up resources held by the iterator.
Close()
// SeekEngineKeyGE advances the iterator to the first key in the engine
// which is >= the provided key.
SeekEngineKeyGE(key EngineKey) (valid bool, err error)
// SeekEngineKeyLT advances the iterator to the first key in the engine
// which is < the provided key.
SeekEngineKeyLT(key EngineKey) (valid bool, err error)
// NextEngineKey advances the iterator to the next key/value in the
// iteration. After this call, valid will be true if the iterator was not
// originally positioned at the last key. Note that unlike
// MVCCIterator.NextKey, this method does not skip other versions with the
// same EngineKey.Key.
// TODO(sumeer): change MVCCIterator.Next() to match the
// return values, change all its callers, and rename this
// to Next().
NextEngineKey() (valid bool, err error)
// PrevEngineKey moves the iterator backward to the previous key/value in
// the iteration. After this call, valid will be true if the iterator was
// not originally positioned at the first key.
PrevEngineKey() (valid bool, err error)
// HasPointAndRange returns whether the iterator is positioned on a point or
// range key (shared with MVCCIterator interface).
HasPointAndRange() (bool, bool)
// EngineRangeBounds returns the current range key bounds.
EngineRangeBounds() (roachpb.Span, error)
// EngineRangeKeys returns the engine range keys at the current position.
EngineRangeKeys() []EngineRangeKeyValue
// RangeKeyChanged returns true if the previous seek or step moved to a
// different range key (or none at all). This includes an exhausted iterator.
RangeKeyChanged() bool
// UnsafeEngineKey returns the same value as EngineKey, but the memory is
// invalidated on the next call to {Next,NextKey,Prev,SeekGE,SeekLT,Close}.
// REQUIRES: latest positioning function returned valid=true.
UnsafeEngineKey() (EngineKey, error)
// EngineKey returns the current key.
// REQUIRES: latest positioning function returned valid=true.
EngineKey() (EngineKey, error)
// UnsafeRawEngineKey returns the current raw (encoded) key corresponding to
// EngineKey. This is a low-level method and callers should avoid using
// it. This is currently only used by intentInterleavingIter to implement
// UnsafeRawKey.
UnsafeRawEngineKey() []byte
// UnsafeValue returns the same value as Value, but the memory is
// invalidated on the next call to {Next,NextKey,Prev,SeekGE,SeekLT,Close}.
// REQUIRES: latest positioning function returned valid=true.
UnsafeValue() ([]byte, error)
// UnsafeLazyValue is only for use inside the storage package. It exposes
// the LazyValue at the current iterator position, and hence delays fetching
// the actual value.
// REQUIRES: latest positioning function returned valid=true.
UnsafeLazyValue() pebble.LazyValue
// Value returns the current value as a byte slice.
// REQUIRES: latest positioning function returned valid=true.
Value() ([]byte, error)
// ValueLen returns the length of the current value. ValueLen should be
// preferred when the actual value is not needed. In some circumstances, the
// storage engine may be able to avoid loading the value.
// REQUIRES: latest positioning function returned valid=true.
ValueLen() int
// CloneContext is a low-level method only for use in the storage package,
// that provides sufficient context that the iterator may be cloned.
CloneContext() CloneContext
// SeekEngineKeyGEWithLimit is similar to SeekEngineKeyGE, but takes an
// additional exclusive upper limit parameter. The limit is semantically
// best-effort, and is an optimization to avoid O(n^2) iteration behavior in
// some pathological situations (uncompacted deleted locks).
SeekEngineKeyGEWithLimit(key EngineKey, limit roachpb.Key) (state pebble.IterValidityState, err error)
// SeekEngineKeyLTWithLimit is similar to SeekEngineKeyLT, but takes an
// additional inclusive lower limit parameter. The limit is semantically
// best-effort, and is an optimization to avoid O(n^2) iteration behavior in
// some pathological situations (uncompacted deleted locks).
SeekEngineKeyLTWithLimit(key EngineKey, limit roachpb.Key) (state pebble.IterValidityState, err error)
// NextEngineKeyWithLimit is similar to NextEngineKey, but takes an
// additional exclusive upper limit parameter. The limit is semantically
// best-effort, and is an optimization to avoid O(n^2) iteration behavior in
// some pathological situations (uncompacted deleted locks).
NextEngineKeyWithLimit(limit roachpb.Key) (state pebble.IterValidityState, err error)
// PrevEngineKeyWithLimit is similar to PrevEngineKey, but takes an
// additional inclusive lower limit parameter. The limit is semantically
// best-effort, and is an optimization to avoid O(n^2) iteration behavior in
// some pathological situations (uncompacted deleted locks).
PrevEngineKeyWithLimit(limit roachpb.Key) (state pebble.IterValidityState, err error)
// Stats returns statistics about the iterator.
Stats() IteratorStats
}
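// An illustrative sketch (not part of the contract) of forward iteration with
// an EngineIterator, which reports validity directly from each positioning
// call rather than through a separate Valid() method.
func exampleEngineScan(iter EngineIterator, start EngineKey) error {
	valid, err := iter.SeekEngineKeyGE(start)
	for ; valid; valid, err = iter.NextEngineKey() {
		key, keyErr := iter.UnsafeEngineKey()
		if keyErr != nil {
			return keyErr
		}
		_ = key // process the current EngineKey/value pair here
	}
	return err
}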
// CloneContext is an opaque type encapsulating sufficient context to construct
// a clone of an existing iterator.
type CloneContext struct {
rawIter pebbleiter.Iterator
engine *Pebble
}
// IterOptions contains options used to create an {MVCC,Engine}Iterator.
//
// For performance, every {MVCC,Engine}Iterator must specify either Prefix or
// UpperBound.
type IterOptions struct {
// If Prefix is true, Seek will use the user-key prefix of the supplied
// {MVCC,Engine}Key (the Key field) to restrict which sstables are searched,
// but iteration (using Next) over keys without the same user-key prefix
// will not work correctly (keys may be skipped).
Prefix bool
// LowerBound gives this iterator an inclusive lower bound. Attempts to
// SeekReverse or Prev to a key that is strictly less than the bound will
// invalidate the iterator.
LowerBound roachpb.Key
// UpperBound gives this iterator an exclusive upper bound. Attempts to Seek
// or Next to a key that is greater than or equal to the bound will invalidate
// the iterator. UpperBound must be provided unless Prefix is true, in which
// case the end of the prefix will be used as the upper bound.
UpperBound roachpb.Key
// MinTimestamp and MaxTimestamp, if set, indicate that only keys
// within the time range formed by [MinTimestamp, MaxTimestamp] should be
// returned. The underlying iterator may be able to efficiently skip over
// keys outside of the hinted time range, e.g., when a block handle
// indicates that the block contains no keys within the time range. Intents
// will not be visible to such iterators at all. This is only relevant for
// MVCCIterators.
//
// Note that time-bound iterators previously were only a performance
// optimization but now guarantee that no keys outside of the [start, end]
// time range will be returned.
//
// NB: Range keys are not currently subject to timestamp filtering due to
// complications with MVCCIncrementalIterator. See:
// https://github.com/cockroachdb/cockroach/issues/86260
MinTimestamp, MaxTimestamp hlc.Timestamp
// KeyTypes specifies the types of keys to surface: point and/or range keys.
// Use HasPointAndRange() to determine which key type is present at a given
// iterator position, and RangeBounds() and RangeKeys() to access range keys.
// Defaults to IterKeyTypePointsOnly. For more details on range keys, see
// comment on SimpleMVCCIterator.
KeyTypes IterKeyType
// RangeKeyMaskingBelow enables masking (hiding) of point keys by range keys.
// Any range key with a timestamp at or below RangeKeyMaskingBelow
// will mask point keys below it, preventing them from being surfaced.
// Consider the following example:
//
// 4 o---------------o RangeKeyMaskingBelow=4 emits b3
// 3 b3 d3 RangeKeyMaskingBelow=3 emits b3,d3,f2
// 2 o---------------o f2 RangeKeyMaskingBelow=2 emits b3,d3,f2
// 1 a1 b1 o-------o RangeKeyMaskingBelow=1 emits a1,b3,b1,d3,f2
// a b c d e f g
//
// Range keys themselves are not affected by the masking, and will be
// emitted as normal.
RangeKeyMaskingBelow hlc.Timestamp
// ReadCategory is used to map to a user-understandable category string, for
// stats aggregation and metrics, and a Pebble-understandable QoS.
ReadCategory fs.ReadCategory
// useL6Filters allows the caller to opt into reading filter blocks for
// L6 sstables. Only for use with Prefix = true. Helpful when many prefix
// Seeks are expected in quick succession and are likely to not yield a
// single key. Filter blocks in L6 can be relatively large, often
// larger than data blocks, so the benefit of loading them in the cache
// is minimized if the probability of the key existing is not low or if
// this is a one-time Seek (where loading the data block directly is better).
useL6Filters bool
}
// IterKeyType configures which types of keys an iterator should surface.
//
// TODO(erikgrinaker): Combine this with MVCCIterKind somehow.
type IterKeyType = pebble.IterKeyType
const (
// IterKeyTypePointsOnly iterates over point keys only.
IterKeyTypePointsOnly = pebble.IterKeyTypePointsOnly
// IterKeyTypePointsAndRanges iterates over both point and range keys.
IterKeyTypePointsAndRanges = pebble.IterKeyTypePointsAndRanges
// IterKeyTypeRangesOnly iterates over only range keys.
IterKeyTypeRangesOnly = pebble.IterKeyTypeRangesOnly
)
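// A minimal sketch of configuring IterOptions for an iterator that surfaces
// both point and range keys within [start, end), masking point keys shadowed
// by MVCC range tombstones at or below maskBelow. Only fields documented
// above are used; the function itself is illustrative.
func exampleRangeAwareIterOptions(start, end roachpb.Key, maskBelow hlc.Timestamp) IterOptions {
	return IterOptions{
		LowerBound:           start,
		UpperBound:           end, // required unless Prefix is set
		KeyTypes:             IterKeyTypePointsAndRanges,
		RangeKeyMaskingBelow: maskBelow,
	}
}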
// MVCCIterKind is used to inform Reader about the kind of iteration desired
// by the caller.
type MVCCIterKind int
// "Intent" refers to non-inline meta, that can be interleaved or separated.
const (
// MVCCKeyAndIntentsIterKind specifies that intents must be seen, and appear
// interleaved with keys, even if they are in a separated lock table.
// Iterators of this kind are not allowed to span from local to global keys,
// since the physical layout has the separated lock table in-between the
// local and global keys. These iterators do strict error checking and panic
// if the caller seems to be trying to violate this constraint.
// Specifically:
// - If both bounds are set they must not span from local to global.
// - Any bound (lower or upper), constrains the iterator for its lifetime to
// one of local or global keys. The iterator will not tolerate a seek that
// violates this constraint.
// We could, with significant code complexity, not constrain an iterator for
// its lifetime, and allow a seek that specifies a global (local) key to
// change the constraint to global (local). This would allow reuse of the
// same iterator with a large global upper-bound. But a Next call on the
// highest local key (Prev on the lowest global key) would still not be able
// to transparently skip over the intermediate lock table. We deem that
// behavior to be more surprising and bug-prone (for the caller), than being
// strict.
MVCCKeyAndIntentsIterKind MVCCIterKind = iota
// MVCCKeyIterKind specifies that the caller does not need to see intents.
// Any interleaved intents may be seen, but no correctness properties are
// derivable from such partial knowledge of intents. NB: this is a performance
// optimization when iterating over (a) MVCC keys where the caller does
// not need to see intents, (b) a key space that is known to not have multiple
// versions (and therefore will never have intents), like the raft log.
MVCCKeyIterKind
)
// Reader is the read interface to an engine's data. Certain implementations
// of Reader guarantee consistency of the underlying engine state across the
// different iterators created by NewMVCCIterator, NewEngineIterator:
// - pebbleSnapshot, because it uses an engine snapshot.
// - pebbleReadOnly, pebbleBatch: when the IterOptions do not specify a
// timestamp hint (see IterOptions). Note that currently the engine state
// visible here is not as of the time of the Reader creation. It is the time
// when the first iterator is created, or earlier if
// PinEngineStateForIterators is called.
//
// The ConsistentIterators method returns true when this consistency is
// guaranteed by the Reader.
// TODO(sumeer): this partial consistency can be a source of bugs if future
// code starts relying on it, but rarely uses a Reader that does not guarantee
// it. Can we enumerate the current cases where KV uses Engine as a Reader?
type Reader interface {
// Close closes the reader, freeing up any outstanding resources. Note that
// various implementations have slightly different behaviors. In particular,
// Distinct() batches release their parent batch for future use while
// Engines, Snapshots and Batches free the associated C++ resources.
Close()
// Closed returns true if the reader has been closed or is not usable.
// Objects backed by this reader (e.g. Iterators) can check this to ensure
// that they are not using a closed engine. Intended for use within package
// engine; exported to enable wrappers to exist in other packages.
Closed() bool
// MVCCIterate scans from the start key to the end key (exclusive), invoking
// the function f on each key value pair. The inputs are copies, and safe to
// retain beyond the function call. It supports interleaved iteration over
// point and/or range keys, providing any overlapping range keys for each
// point key if requested. If f returns an error or if the scan itself
// encounters an error, the iteration will stop and return the error.
//
// Note that this method is not expected to take into account the timestamp of
// the end key; all MVCCKeys at end.Key are considered excluded in the
// iteration.
MVCCIterate(
ctx context.Context, start, end roachpb.Key, iterKind MVCCIterKind, keyTypes IterKeyType,
readCategory fs.ReadCategory, f func(MVCCKeyValue, MVCCRangeKeyStack) error,
) error
// NewMVCCIterator returns a new instance of an MVCCIterator over this engine.
// The caller must invoke Close() on it when done to free resources.
//
// Write visibility semantics:
//
// 1. An iterator has a consistent view of the reader as of the time of its
// creation. Subsequent writes are never visible to it.
//
// 2. All iterators on readers with ConsistentIterators=true have a consistent
// view of the _engine_ (not reader) as of the time of the first iterator
// creation or PinEngineStateForIterators call: newer engine writes are
// never visible. The opposite holds for ConsistentIterators=false: new
// iterators see the most recent engine state at the time of their creation.
//
// 3. Iterators on unindexed batches never see batch writes, but satisfy
// ConsistentIterators for engine write visibility.
//
// 4. Iterators on indexed batches see all batch writes as of their creation
// time, but they satisfy ConsistentIterators for engine writes.
NewMVCCIterator(
ctx context.Context, iterKind MVCCIterKind, opts IterOptions) (MVCCIterator, error)
// NewEngineIterator returns a new instance of an EngineIterator over this
// engine. The caller must invoke EngineIterator.Close() when finished
// with the iterator to free resources. The caller can change IterOptions
// after this function returns.
NewEngineIterator(ctx context.Context, opts IterOptions) (EngineIterator, error)
// ScanInternal allows a caller to inspect the underlying engine's InternalKeys
// using a visitor pattern, while also allowing for keys in shared files to be
// skipped if a visitor is provided for visitSharedFiles. Useful for
// fast-replicating state from one Reader to another. Point keys are collapsed
// such that only one internal key per user key is exposed, and rangedels and
// range keys are collapsed and defragmented with each span being surfaced
// exactly once, alongside the highest seqnum for a rangedel on that span
// (for rangedels) or all coalesced rangekey.Keys in that span (for range
// keys). A point key deleted by a rangedel will not be exposed, but the
// rangedel itself will be exposed.
//
// Note that ScanInternal does not obey the guarantees indicated by
// ConsistentIterators.
ScanInternal(
ctx context.Context, lower, upper roachpb.Key,
visitPointKey func(key *pebble.InternalKey, value pebble.LazyValue, info pebble.IteratorLevel) error,
visitRangeDel func(start, end []byte, seqNum pebble.SeqNum) error,
visitRangeKey func(start, end []byte, keys []rangekey.Key) error,
visitSharedFile func(sst *pebble.SharedSSTMeta) error,
visitExternalFile func(sst *pebble.ExternalFile) error,
) error
// ConsistentIterators returns true if the Reader implementation guarantees
// that the different iterators constructed by this Reader will see the same
// underlying Engine state. This is not true about Batch writes: new iterators
// will see new writes made to the batch, existing iterators won't.
ConsistentIterators() bool
// PinEngineStateForIterators ensures that the state seen by iterators
// without timestamp hints (see IterOptions) is pinned and will not see
// future mutations. It can be called multiple times on a Reader in which
// case the state seen will be either:
// - As of the first call.
// - For a Reader returned by Engine.NewSnapshot, the pinned state is as of
// the time the snapshot was taken.
// So the semantics that hold for all Readers are that the pinned state
// is somewhere in the time interval between the creation of the Reader and
// the first call to PinEngineStateForIterators.
// REQUIRES: ConsistentIterators returns true.
PinEngineStateForIterators(readCategory fs.ReadCategory) error
}
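// An illustrative sketch (not part of the interface) of a read-only scan
// through a Reader: open an MVCC iterator bounded to [start, end), walk it
// forward, and release it. Error handling is condensed for brevity.
func exampleReaderScan(ctx context.Context, r Reader, start, end roachpb.Key) error {
	iter, err := r.NewMVCCIterator(ctx, MVCCKeyIterKind, IterOptions{
		LowerBound: start,
		UpperBound: end,
	})
	if err != nil {
		return err
	}
	defer iter.Close()
	for iter.SeekGE(MVCCKey{Key: start}); ; iter.Next() {
		if ok, err := iter.Valid(); err != nil || !ok {
			return err
		}
		// Consume iter.UnsafeKey()/iter.UnsafeValue() here.
	}
}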
// EventuallyFileOnlyReader is a specialized Reader that supports a method to
// wait on a transition to being a file-only reader that does not pin any
// keys in-memory.
type EventuallyFileOnlyReader interface {
Reader
// WaitForFileOnly blocks the calling goroutine until this reader has
// transitioned to a file-only reader that does not pin any in-memory state.
// If an error is returned, this transition did not succeed. The Duration
// argument specifies how long to wait before attempting a flush to
// force a transition to a file-only snapshot.
WaitForFileOnly(ctx context.Context, gracePeriodBeforeFlush time.Duration) error
}
// Writer is the write interface to an engine's data.
type Writer interface {
// ApplyBatchRepr atomically applies a set of batched updates. Created by
// calling Repr() on a batch. Using this method is equivalent to constructing
// and committing a batch whose Repr() equals repr. If sync is true, the batch
// is synchronously flushed to the OS and written to disk. It is an error to
// specify sync=true if the Writer is a Batch.
//
// It is safe to modify the contents of the arguments after ApplyBatchRepr
// returns.
ApplyBatchRepr(repr []byte, sync bool) error
// ClearMVCC removes the point key with the given MVCCKey from the db. It does
// not affect range keys. It requires that the timestamp is non-empty (see
// ClearUnversioned or ClearIntent if the timestamp is empty). Note that clear
// actually removes entries from the storage engine, rather than inserting
// MVCC tombstones.
//
// If the caller knows the size of the value that is being cleared, they
// should set ClearOptions.{ValueSizeKnown, ValueSize} accordingly to
// improve the storage engine's ability to prioritize compactions.
//
// It is safe to modify the contents of the arguments after it returns.
ClearMVCC(key MVCCKey, opts ClearOptions) error
// ClearUnversioned removes an unversioned item from the db. It is for use
// with inline metadata (not intents) and other unversioned keys (like
// Range-ID local keys). It does not affect range keys.
//
// If the caller knows the size of the value that is being cleared, they
// should set ClearOptions.{ValueSizeKnown, ValueSize} accordingly to
// improve the storage engine's ability to prioritize compactions.
//
// It is safe to modify the contents of the arguments after it returns.
ClearUnversioned(key roachpb.Key, opts ClearOptions) error
// ClearEngineKey removes the given point key from the engine. It does not
// affect range keys. Note that clear actually removes entries from the
// storage engine. This is a general-purpose and low-level method that should
// be used sparingly, only when the other Clear* methods are not applicable.
//
// If the caller knows the size of the value that is being cleared, they
// should set ClearOptions.{ValueSizeKnown, ValueSize} accordingly to
// improve the storage engine's ability to prioritize compactions.
//
// It is safe to modify the contents of the arguments after it returns.
ClearEngineKey(key EngineKey, opts ClearOptions) error
// ClearRawRange removes point and/or range keys from start (inclusive) to end
// (exclusive) using Pebble range tombstones. It can be applied to a range
// consisting of MVCCKeys or the more general EngineKeys -- it simply uses the
// roachpb.Key parameters as the Key field of an EngineKey. This implies that
// it does not clear intents unless the intent lock table is targeted
// explicitly.
//
// Similar to the other Clear* methods, this method actually removes entries
// from the storage engine. It is safe to modify the contents of the arguments
// after it returns.
ClearRawRange(start, end roachpb.Key, pointKeys, rangeKeys bool) error
// ClearMVCCRange removes MVCC point and/or range keys (including intents)
// from start (inclusive) to end (exclusive) using Pebble range tombstones.
//
// Similar to the other Clear* methods, this method actually removes entries
// from the storage engine. It is safe to modify the contents of the arguments
// after it returns.
ClearMVCCRange(start, end roachpb.Key, pointKeys, rangeKeys bool) error
// ClearMVCCVersions removes MVCC point key versions from start (inclusive) to
// end (exclusive) using a Pebble range tombstone. It is meant for efficiently
// clearing a subset of versions of a key, since the parameters are MVCCKeys
// and not roachpb.Keys, but it can also be used across multiple keys. It will
// ignore intents and range keys, leaving them in place.
//
// Similar to the other Clear* methods, this method actually removes entries
// from the storage engine. It is safe to modify the contents of the arguments
// after it returns.
ClearMVCCVersions(start, end MVCCKey) error
// ClearMVCCIteratorRange removes all point and/or range keys in the given
// span using an MVCC iterator, by clearing individual keys (including
// intents).
//
// Similar to the other Clear* methods, this method actually removes entries
// from the storage engine. It is safe to modify the contents of the arguments
// after it returns.
//
// TODO(erikgrinaker): This should be a separate function rather than an
// interface method, but we keep it for now to make use of UnsafeRawKey() when
// clearing keys.
ClearMVCCIteratorRange(start, end roachpb.Key, pointKeys, rangeKeys bool) error
// ClearMVCCRangeKey deletes an MVCC range key from start (inclusive) to end
// (exclusive) at the given timestamp. For any range key that straddles the
// start and end boundaries, only the segments within the boundaries will be
// cleared. Range keys at other timestamps are unaffected. Clears are
// idempotent.
//
// This method is primarily intended for MVCC garbage collection and similar
// internal use.
ClearMVCCRangeKey(rangeKey MVCCRangeKey) error
// PutMVCCRangeKey writes an MVCC range key. It will replace any overlapping
// range keys at the given timestamp (even partial overlap). Only MVCC range
// tombstones, i.e. an empty value, are currently allowed (other kinds will
// need additional handling in MVCC APIs and elsewhere, e.g. stats and GC).
//
// Range keys must be accessed using special iterator options and methods,
// see SimpleMVCCIterator.RangeKeys() for details.
//
// For more information on MVCC range keys, see this tech note:
// https://github.com/cockroachdb/cockroach/blob/master/docs/tech-notes/mvcc-range-tombstones.md
PutMVCCRangeKey(MVCCRangeKey, MVCCValue) error
// PutRawMVCCRangeKey is like PutMVCCRangeKey, but accepts an encoded
// MVCCValue. It can be used to avoid decoding and immediately re-encoding an
// MVCCValue, but should generally be avoided due to the lack of type safety.
//
// It is safe to modify the contents of the arguments after PutRawMVCCRangeKey
// returns.
PutRawMVCCRangeKey(MVCCRangeKey, []byte) error
// PutEngineRangeKey sets the given range key to the values provided. This is
// a general-purpose and low-level method that should be used sparingly, only
// when the other Put* methods are not applicable.
//
// It is safe to modify the contents of the arguments after it returns.
PutEngineRangeKey(start, end roachpb.Key, suffix, value []byte) error
// ClearEngineRangeKey clears the given range key. This is a general-purpose
// and low-level method that should be used sparingly, only when the other
// Clear* methods are not applicable.
//
// It is safe to modify the contents of the arguments after it returns.
ClearEngineRangeKey(start, end roachpb.Key, suffix []byte) error
// Merge is a high-performance write operation used for values which are
// accumulated over several writes. Multiple values can be merged
// sequentially into a single key; a subsequent read will return a "merged"
// value which is computed from the original merged values. We only
// support Merge for keys with no version.
//
// Merge currently provides specialized behavior for three data types:
// integers, byte slices, and time series observations. Merged integers are
// summed, acting as a high-performance accumulator. Byte slices are simply
// concatenated in the order they are merged. Time series observations
// (stored as byte slices with a special tag on the roachpb.Value) are
// combined with specialized logic beyond that of simple byte slices.
//
// It is safe to modify the contents of the arguments after Merge returns.
Merge(key MVCCKey, value []byte) error
// PutMVCC sets the given key to the value provided. It requires that the
// timestamp is non-empty (see {PutUnversioned,PutIntent} if the timestamp
// is empty).
//
// It is safe to modify the contents of the arguments after PutMVCC returns.
PutMVCC(key MVCCKey, value MVCCValue) error
// PutRawMVCC is like PutMVCC, but it accepts an encoded MVCCValue. It
// can be used to avoid decoding and immediately re-encoding an MVCCValue,
// but should generally be avoided due to the lack of type safety.
//
// It is safe to modify the contents of the arguments after PutRawMVCC
// returns.
PutRawMVCC(key MVCCKey, value []byte) error
// PutUnversioned sets the given key to the value provided. It is for use
// with inline metadata (not intents) and other unversioned keys (like
// Range-ID local keys).
//
// It is safe to modify the contents of the arguments after Put returns.
PutUnversioned(key roachpb.Key, value []byte) error
// PutEngineKey sets the given key to the value provided. This is a
// general-purpose and low-level method that should be used sparingly,
// only when the other Put* methods are not applicable.
//
// It is safe to modify the contents of the arguments after Put returns.
PutEngineKey(key EngineKey, value []byte) error
// LogData adds the specified data to the RocksDB WAL. The data is
// uninterpreted by RocksDB (i.e. not added to the memtable or sstables).
//
// It is safe to modify the contents of the arguments after LogData returns.
LogData(data []byte) error
// LogLogicalOp logs the specified logical mvcc operation with the provided
// details to the writer, if it has logical op logging enabled. For most
// Writer implementations, this is a no-op.
LogLogicalOp(op MVCCLogicalOpType, details MVCCLogicalOpDetails)
// SingleClearEngineKey removes the most recent write to the item from the
// db with the given key, using Pebble's SINGLEDEL operation. This
// originally resembled the semantics of RocksDB
// (https://github.com/facebook/rocksdb/wiki/Single-Delete), but was
// strengthened in Pebble such that sequences (from more recent to older)
// like SINGLEDEL#20, SET#17, DEL#15, ... work as intended since there has
// been only one SET more recent than the last DEL. These also work if the
// DEL is replaced by a RANGEDEL, since RANGEDELs are used extensively to
// drop all the data for a replica, which may then be recreated in the
// future. The behavior is non-deterministic and definitely not what the
// caller wants if there are multiple SETs/MERGEs etc. immediately older
// than the SINGLEDEL.
//
// Note that using SINGLEDEL requires the caller to not duplicate SETs
// without knowing about it. That is, the caller cannot rely simply on
// idempotent writes for correctness, if they are going to be later deleted
// using SINGLEDEL. A current case where duplication without knowledge can
// happen is sstable ingestion for "global" keys, say during import and
// schema change. SSTable ingestion via the KV-layer's AddSSTable changes
// the replicated state machine, but does not atomically update the
// RangeAppliedState.RaftAppliedIndex, so on a node crash the SSTable
// ingestion will be repeated due to replaying the Raft log. Hence,
// SingleClearEngineKey must not be used for global keys e.g. do not
// consider using it for MVCC GC.
//
// This operation actually removes entries from the storage engine, rather
// than inserting MVCC tombstones. This is a low-level interface that must
// not be called from outside the storage package. It is part of the
// interface because there are structs that wrap Writer and implement the
// Writer interface, that are not part of the storage package.
//
// It is safe to modify the contents of the arguments after it returns.
SingleClearEngineKey(key EngineKey) error
// ShouldWriteLocalTimestamps is only for internal use in the storage package.
// This method is temporary, to handle the transition from clusters where not
// all nodes understand local timestamps.
ShouldWriteLocalTimestamps(ctx context.Context) bool
// BufferedSize returns the size of the underlying buffered writes if the
// Writer implementation is buffered, and 0 if the Writer implementation is
// not buffered. Buffered writers are expected to always give a monotonically
// increasing size.
BufferedSize() int
}
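// A minimal sketch (not part of the interface) of writing and then clearing a
// versioned point key through a Writer. The key and value are assumed to have
// been constructed by the caller; a cleared key is physically removed rather
// than covered with an MVCC tombstone.
func examplePutAndClearMVCC(w Writer, key MVCCKey, value MVCCValue) error {
	if err := w.PutMVCC(key, value); err != nil {
		return err
	}
	// ClearOptions may carry a size hint for the value being removed; leaving
	// it zero is always correct, just less helpful to compaction heuristics.
	return w.ClearMVCC(key, ClearOptions{})
}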
// InternalWriter is an extension of Writer that supports additional low-level
// methods to operate on internal keys in Pebble. These additional methods
// should only be used sparingly, when one of the high-level methods cannot
// achieve the same ends.
type InternalWriter interface {
Writer
// ClearRawEncodedRange is similar to ClearRawRange, except it takes pre-encoded
// start, end keys and bypasses the EngineKey encoding step. It also only
// operates on point keys; for range keys, use ClearEngineRangeKey or
// PutInternalRangeKey.
//
// It is safe to modify the contents of the arguments after it returns.
ClearRawEncodedRange(start, end []byte) error
// PutInternalRangeKey adds an InternalRangeKey to this batch. This is a very
// low-level method that should be used sparingly.
//
// It is safe to modify the contents of the arguments after it returns.
PutInternalRangeKey(start, end []byte, key rangekey.Key) error
// PutInternalPointKey adds a point InternalKey to this batch. This is a very
// low-level method that should be used sparingly.
//
// It is safe to modify the contents of the arguments after it returns.
PutInternalPointKey(key *pebble.InternalKey, value []byte) error
}
// ClearOptions holds optional parameters to methods that clear keys from the
// storage engine.
type ClearOptions struct {
// ValueSizeKnown indicates whether the ValueSize carries a meaningful
// value. If false, ValueSize is ignored.
ValueSizeKnown bool
// ValueSize may be provided to indicate the size of the existing KV
// record's value that is being removed. ValueSize should be the encoded
// value size that the storage engine observes. If the value is an
// MVCCMetadata, ValueSize should be the length of the encoded MVCCMetadata.
// If the value is an MVCCValue, ValueSize should be the length of the
// encoded MVCCValue.
//
// Setting ValueSize and ValueSizeKnown improves the storage engine's
// ability to estimate space amplification and prioritize compactions.
// Without it, compaction heuristics rely on average value sizes which are
// susceptible to over and under estimation.
//
// If the true value size is unknown, leave ValueSizeKnown false.
// Correctness is not compromised if ValueSize is incorrect; the underlying
// key will always be cleared regardless of whether its value size matches
// the provided value.
ValueSize uint32
}
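// A sketch of populating ClearOptions when the caller already holds the
// encoded value it is about to remove; encodedValue is hypothetical and only
// its length is used.
func exampleClearWithSizeHint(w Writer, key MVCCKey, encodedValue []byte) error {
	return w.ClearMVCC(key, ClearOptions{
		ValueSizeKnown: true,
		ValueSize:      uint32(len(encodedValue)),
	})
}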
// ReadWriter is the read/write interface to an engine's data.
type ReadWriter interface {
Reader
Writer
}
// DurabilityRequirement is an advanced option. If in doubt, use
// StandardDurability.
//
// GuaranteedDurability maps to pebble.IterOptions.OnlyReadGuaranteedDurable.
// This acknowledges the fact that we do not (without sacrificing correctness)
// sync the WAL for many writes, and there are some advanced cases
// (raftLogTruncator) that need visibility into what is guaranteed durable.
type DurabilityRequirement int8
const (
// StandardDurability is what should normally be used.
StandardDurability DurabilityRequirement = iota
// GuaranteedDurability is an advanced option (only for raftLogTruncator).
GuaranteedDurability
)
// Engine is the interface that wraps the core operations of a key/value store.
type Engine interface {
Reader
Writer
// Attrs returns the engine/store attributes.
Attrs() roachpb.Attributes
// Capacity returns capacity details for the engine's available storage.
Capacity() (roachpb.StoreCapacity, error)
// Properties returns the low-level properties for the engine's underlying storage.
Properties() roachpb.StoreProperties
// Compact forces compaction over the entire database.
Compact() error
// Env returns the filesystem environment used by the Engine.
Env() *fs.Env
// Flush causes the engine to write all in-memory data to disk
// immediately.
Flush() error
// GetMetrics retrieves metrics from the engine.
GetMetrics() Metrics
// GetEncryptionRegistries returns the file and key registries when encryption is enabled
// on the store.
GetEncryptionRegistries() (*fs.EncryptionRegistries, error)
// GetEnvStats retrieves stats about the engine's environment.
// For RocksDB, this includes details of at-rest encryption.
GetEnvStats() (*fs.EnvStats, error)
// GetAuxiliaryDir returns a path under which files can be stored
// persistently, and from which data can be ingested by the engine.
//
// Not thread safe.
GetAuxiliaryDir() string
// NewBatch returns a new instance of a batched engine which wraps
// this engine. Batched engines accumulate all mutations and apply
// them atomically on a call to Commit().
NewBatch() Batch
// NewReader returns a new instance of a Reader that wraps this engine, and
// with the given durability requirement. This wrapper caches iterators to
// avoid the overhead of creating multiple iterators for batched reads.
//
// All iterators created from a read-only engine are guaranteed to provide a
// consistent snapshot of the underlying engine. See the comment on the
// Reader interface and the Reader.ConsistentIterators method.
NewReader(durability DurabilityRequirement) Reader
// NewReadOnly returns a new instance of a ReadWriter that wraps this
// engine, and with the given durability requirement. This wrapper panics
// when unexpected operations (e.g., write operations) are executed on it
// and caches iterators to avoid the overhead of creating multiple iterators
// for batched reads.
//
// All iterators created from a read-only engine are guaranteed to provide a
// consistent snapshot of the underlying engine. See the comment on the
// Reader interface and the Reader.ConsistentIterators method.
//
// TODO(sumeer,jackson): Remove this method and force the caller to operate
// explicitly with a separate WriteBatch and Reader.
NewReadOnly(durability DurabilityRequirement) ReadWriter
// NewUnindexedBatch returns a new instance of a batched engine which wraps
// this engine. It is unindexed, in that writes to the batch are not
// visible to reads until after it commits. The batch accumulates all
// mutations and applies them atomically on a call to Commit().
//
// Reads will be satisfied by reading from the underlying engine, i.e., the
// caller does not see its own writes. This setting should be used only when
// the caller is certain that this optimization is correct, and beneficial.
// There are subtleties here -- see the discussion on
// https://github.com/cockroachdb/cockroach/pull/57661 for more details.
//
// TODO(sumeer,jackson): Remove this method and force the caller to operate
// explicitly with a separate WriteBatch and Reader.
NewUnindexedBatch() Batch
// NewWriteBatch returns a new write batch that will commit to the
// underlying Engine. The batch accumulates all mutations and applies them
// atomically on a call to Commit().
NewWriteBatch() WriteBatch
// NewSnapshot returns a new instance of a read-only snapshot engine. A
// snapshot provides a consistent view of the database across multiple
// iterators. If a caller only needs a single consistent iterator, they
// should create an iterator directly off the engine instead.
//
// Acquiring a snapshot is instantaneous and is inexpensive if quickly
// released. Snapshots are released by invoking Close(). Open snapshots
// prevent compactions from reclaiming space or removing tombstones for any
// keys written after the snapshot is acquired. This can be problematic
// during rebalancing or large ingestions, so they should be used sparingly
// and briefly.
//
// Note that snapshots must not be used after the original engine has been
// stopped.
NewSnapshot() Reader
// NewEventuallyFileOnlySnapshot returns a new instance of a read-only
// eventually file-only snapshot. This type of snapshot incurs lower write-amp
// than a regular Snapshot opened with NewSnapshot; however, it incurs greater
// space-amp on disk for the duration of this snapshot's lifetime. There is
// also a chance that its conversion to a file-only snapshot could fail if an
// excise operation conflicts with one of the passed-in keyRanges. Note that
// if no keyRanges are passed in, a file-only snapshot
// is created from the start; this is usually not desirable as it makes no
// deterministic guarantees about what will be readable (anything in memtables
// will not be visible). Snapshot guarantees are only provided for keys
// in the passed-in keyRanges; reads are not guaranteed to be consistent
// outside of these bounds.
NewEventuallyFileOnlySnapshot(keyRanges []roachpb.Span) EventuallyFileOnlyReader
// IngestLocalFiles atomically links a slice of files into the RocksDB
// log-structured merge-tree.
IngestLocalFiles(ctx context.Context, paths []string) error
// IngestLocalFilesWithStats is a variant of IngestLocalFiles that
// additionally returns ingestion stats.
IngestLocalFilesWithStats(
ctx context.Context, paths []string) (pebble.IngestOperationStats, error)
// IngestAndExciseFiles is a variant of IngestLocalFilesWithStats that excises
// an ExciseSpan, and ingests either local or shared sstables or both. It also
// takes the flag sstsContainExciseTombstone to signal that the exciseSpan
// contains RANGEDELs and RANGEKEYDELs.
//
// NB: It is the caller's responsibility to ensure that, if
// sstsContainExciseTombstone is set to true, the ingested sstables contain a
// tombstone for the exciseSpan.
IngestAndExciseFiles(
ctx context.Context,
paths []string,
shared []pebble.SharedSSTMeta,
external []pebble.ExternalFile,
exciseSpan roachpb.Span,
sstsContainExciseTombstone bool,
) (pebble.IngestOperationStats, error)
// IngestExternalFiles is a variant of IngestLocalFiles that takes external
// files. These files can be referred to by multiple stores, but are not
// modified or deleted by the Engine doing the ingestion.
IngestExternalFiles(ctx context.Context, external []pebble.ExternalFile) (pebble.IngestOperationStats, error)
// PreIngestDelay offers an engine the chance to backpressure ingestions.
// When called, it may choose to block if the engine determines that it is in
// or approaching a state where further ingestions may risk its health.
PreIngestDelay(ctx context.Context)
// ApproximateDiskBytes returns an approximation of the on-disk size and file
// counts for the given key span, along with how many of those bytes are on
// remote, as well as specifically external remote, storage.
ApproximateDiskBytes(from, to roachpb.Key) (total, remote, external uint64, _ error)
// ConvertFilesToBatchAndCommit converts local files with the given paths to
// a WriteBatch and commits the batch with sync=true. The files represented
// in paths must not be overlapping -- this is the same contract as
// IngestLocalFiles*. Additionally, clearedSpans represents the spans which
// must be deleted before writing the data contained in these paths.
//
// This method is expected to be used instead of IngestLocalFiles* or
// IngestAndExciseFiles when the sum of the file sizes is small.
//
// TODO(sumeer): support this as an alternative to IngestAndExciseFiles.
// This should be easy since we use NewSSTEngineIterator to read the ssts,
// which supports multiple levels.
ConvertFilesToBatchAndCommit(
ctx context.Context, paths []string, clearedSpans []roachpb.Span,
) error
// CompactRange ensures that the specified range of key value pairs is
// optimized for space efficiency.
CompactRange(start, end roachpb.Key) error
// ScanStorageInternalKeys returns key-level statistics for each level of a Pebble store, restricted to keys that overlap start and end.
ScanStorageInternalKeys(start, end roachpb.Key, megabytesPerSecond int64) ([]enginepb.StorageInternalKeysMetrics, error)
// GetTableMetrics returns information about sstables that overlap start and end.
GetTableMetrics(start, end roachpb.Key) ([]enginepb.SSTableMetricsInfo, error)
// RegisterFlushCompletedCallback registers a callback that will be run for
// every successful flush. Only one callback can be registered at a time, so
// registering again replaces the previous callback. The callback must
// return quickly and must not call any methods on the Engine in the context
// of the callback since it could cause a deadlock (since the callback may
// be invoked while holding mutexes).
RegisterFlushCompletedCallback(cb func())
// CreateCheckpoint creates a checkpoint of the engine in the given directory,
// which must not exist. The directory should be on the same file system so
// that hard links can be used. If spans is not empty, the checkpoint excludes
// SSTs that don't overlap with any of these key spans.
CreateCheckpoint(dir string, spans []roachpb.Span) error
// MinVersion is the minimum CockroachDB version that is compatible with this
// store. For newly created stores, this matches the currently active cluster
// version.
// Must never return an empty version.
MinVersion() roachpb.Version
// SetMinVersion is used to signal to the engine the current minimum
// version that it must maintain compatibility with.
SetMinVersion(version roachpb.Version) error
// SetCompactionConcurrency is used to set the engine's compaction
// concurrency. It returns the previous compaction concurrency.
SetCompactionConcurrency(n uint64) uint64
// AdjustCompactionConcurrency adjusts the compaction concurrency up or down by
// the passed delta, down to a minimum of 1.
AdjustCompactionConcurrency(delta int64) uint64
// SetStoreID informs the engine of the store ID, once it is known.
// Used to show the store ID in logs and to initialize the shared object
// creator ID (if shared object storage is configured).
SetStoreID(ctx context.Context, storeID int32) error
// GetStoreID is used to retrieve the configured store ID.
GetStoreID() (int32, error)
// Download informs the engine to download remote files corresponding to the
// given span. The copy parameter controls how it is downloaded, i.e. whether
// it just copies the backing bytes to a local file or rewrites the file
// key-by-key to a new file.
Download(ctx context.Context, span roachpb.Span, copy bool) error
// RegisterDiskSlowCallback registers a callback that will be run when a
// write operation on the disk has been seen to be slow. This callback
// needs to be thread-safe as it could be called repeatedly in multiple threads
// over a short period of time.
RegisterDiskSlowCallback(cb func(info pebble.DiskSlowInfo))
// RegisterLowDiskSpaceCallback registers a callback that will be run when a
// disk is running out of space. This callback needs to be thread-safe as it
// could be called repeatedly in multiple threads over a short period of time.
RegisterLowDiskSpaceCallback(cb func(info pebble.LowDiskSpaceInfo))
// GetPebbleOptions returns the options used when creating the engine. The
// caller must not modify these.
GetPebbleOptions() *pebble.Options
}
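// An illustrative sketch (not part of the interface) of reading from a
// short-lived snapshot: the snapshot provides a consistent view across the
// iterators created from it and should be closed promptly, since open
// snapshots delay space reclamation.
func exampleSnapshotScan(ctx context.Context, e Engine, start, end roachpb.Key) error {
	snap := e.NewSnapshot()
	defer snap.Close()
	iter, err := snap.NewMVCCIterator(ctx, MVCCKeyIterKind, IterOptions{
		LowerBound: start,
		UpperBound: end,
	})
	if err != nil {
		return err
	}
	defer iter.Close()
	for iter.SeekGE(MVCCKey{Key: start}); ; iter.Next() {
		if ok, err := iter.Valid(); err != nil || !ok {
			return err
		}
	}
}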
// Batch is the interface for batch specific operations.
type Batch interface {
// Iterators created on a batch can see some mutations performed after the
// iterator creation. To guarantee that they see all the mutations, the
// iterator has to be repositioned using a seek operation, after the
// mutations were done.
Reader
WriteBatch
// NewBatchOnlyMVCCIterator returns a new instance of MVCCIterator that only
// sees the mutations in the batch (not the engine). It does not interleave
// intents, i.e., it is of kind MVCCKeyIterKind.
//
// REQUIRES: the batch is indexed.
NewBatchOnlyMVCCIterator(ctx context.Context, opts IterOptions) (MVCCIterator, error)
}
// WriteBatch is the interface for write batch specific operations.
type WriteBatch interface {
InternalWriter
// Close closes the batch, freeing up any outstanding resources.
Close()
// Commit atomically applies any batched updates to the underlying engine. If
// sync is true, the batch is synchronously flushed to the OS and committed to
// disk. Otherwise, this call returns before the data is even flushed to the
// OS, and it may be lost if the process terminates.
//
// This is a noop unless the batch was created via NewBatch().
Commit(sync bool) error
// CommitNoSyncWait atomically applies any batched updates to the underlying
// engine and initiates a disk write, but does not wait for that write to
// complete. The caller must call SyncWait to wait for the fsync to complete.
// The caller must not Close the Batch without first calling SyncWait.
CommitNoSyncWait() error
// SyncWait waits for the disk write initiated by a call to CommitNoSyncWait
// to complete.
SyncWait() error
// Empty returns whether the batch has been written to or not.
Empty() bool
// Count returns the number of memtable-modifying operations in the batch.
Count() uint32
// Len returns the size of the underlying representation of the batch.
// Because of the batch header, the size of the batch is never 0 and should
// not be used interchangeably with Empty. The method avoids the memory copy
// that Repr imposes, but it still may require flushing the batch's mutations.
Len() int
// Repr returns the underlying representation of the batch and can be used to
// reconstitute the batch on a remote node using Writer.ApplyBatchRepr().
Repr() []byte
// CommitStats returns stats related to committing the batch. Should be
// called after Batch.Commit. If CommitNoSyncWait is used, it should be
// called after the call to SyncWait.
CommitStats() BatchCommitStats
}
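// commitNoSyncWaitExample is a hedged usage sketch (not part of the package
// API) of the CommitNoSyncWait/SyncWait contract documented above: the disk
// write is started, other work can overlap with the fsync, and SyncWait must
// complete before the batch is closed.
func commitNoSyncWaitExample(eng Engine) error {
	batch := eng.NewBatch()
	defer batch.Close()
	if err := batch.LogData([]byte("example payload")); err != nil {
		return err
	}
	// Start the disk write without waiting for the fsync.
	if err := batch.CommitNoSyncWait(); err != nil {
		return err
	}
	// ... overlap other work with the fsync here ...
	// SyncWait must return before the deferred Close runs.
	return batch.SyncWait()
}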
type BatchCommitStats struct {
pebble.BatchCommitStats
}
// SafeFormat implements redact.SafeFormatter. It does not print the total
// duration.
func (stats BatchCommitStats) SafeFormat(p redact.SafePrinter, _ rune) {
p.Printf("commit-wait %s", stats.CommitWaitDuration)
if stats.WALQueueWaitDuration > 0 {
p.Printf(" wal-q %s", stats.WALQueueWaitDuration)
}
if stats.MemTableWriteStallDuration > 0 {
p.Printf(" mem-stall %s", stats.MemTableWriteStallDuration)
}
if stats.L0ReadAmpWriteStallDuration > 0 {
p.Printf(" l0-stall %s", stats.L0ReadAmpWriteStallDuration)
}
if stats.WALRotationDuration > 0 {
p.Printf(" wal-rot %s", stats.WALRotationDuration)
}
if stats.SemaphoreWaitDuration > 0 {
p.Printf(" sem %s", stats.SemaphoreWaitDuration)
}
}
// Metrics is a set of Engine metrics. Most are contained in the embedded
// *pebble.Metrics struct, which has its own documentation.
type Metrics struct {
*pebble.Metrics
Iterator AggregatedIteratorStats
BatchCommitStats AggregatedBatchCommitStats
// DiskSlowCount counts the number of times Pebble records disk slowness.
DiskSlowCount int64
// DiskStallCount counts the number of times Pebble observes slow writes
// on disk lasting longer than MaxSyncDuration (`storage.max_sync_duration`).
DiskStallCount int64
// SingleDelInvariantViolationCount counts the number of times a
// SingleDelete was found to violate the invariant that it should only be
// used when there is at most one older Set for it to delete.
//
// TODO(sumeer): remove, since can fire due to delete-only compactions.
SingleDelInvariantViolationCount int64
// SingleDelIneffectualCount counts the number of times a SingleDelete was
// ineffectual, i.e., it was elided without deleting anything.
//
// TODO(sumeer): remove, since can fire due to delete-only compactions.
SingleDelIneffectualCount int64
// SharedStorageWriteBytes counts the number of bytes written to shared storage.
SharedStorageWriteBytes int64
// SharedStorageReadBytes counts the number of bytes read from shared storage.
SharedStorageReadBytes int64
// WriteStallCount counts the number of times Pebble intentionally delayed
// incoming writes. Currently, the only two reasons for this to happen are:
// - "memtable count limit reached"
// - "L0 file count limit exceeded"
//
// We do not split this metric across these two reasons, but they can be
// distinguished in the pebble logs.
WriteStallCount int64
WriteStallDuration time.Duration
// BlockLoadConcurrencyLimit is the current limit on the number of concurrent
// sstable block reads.
BlockLoadConcurrencyLimit int64
// BlockLoadsInProgress is the (instantaneous) number of sstable blocks that
// are being read from disk.
BlockLoadsInProgress int64
// BlockLoadsQueued is the cumulative total number of sstable block reads
// that had to wait on the BlockLoadConcurrencyLimit.
BlockLoadsQueued int64
DiskWriteStats []vfs.DiskWriteStatsAggregate
}
// AggregatedIteratorStats holds cumulative stats, collected and summed over all
// of an engine's iterators.
type AggregatedIteratorStats struct {
// BlockBytes holds the sum of sizes of all loaded blocks. If the block was
// compressed, this is the compressed bytes. This value includes blocks that
// were loaded from the cache, and bytes that needed to be read from
// persistent storage.
//
// Currently, there may be some gaps in coverage. (At the time of writing,
// 2nd-level index blocks are excluded.)
BlockBytes uint64
// BlockBytesInCache holds the subset of BlockBytes that were already in the
// block cache, requiring no I/O.
BlockBytesInCache uint64
// BlockReadDuration accumulates the duration spent fetching blocks due to
// block cache misses.
//
// Currently, there may be some gaps in coverage. (At the time of writing,
// range deletion and range key blocks, meta index blocks and properties
// blocks are all excluded.)
BlockReadDuration time.Duration
// ExternalSeeks is the total count of seeks in forward and backward
// directions performed on pebble.Iterators.
ExternalSeeks int
// ExternalSteps is the total count of relative positioning operations (eg,
// Nexts, Prevs, NextPrefix, NextWithLimit, etc) in forward and backward
// directions performed on pebble.Iterators.
ExternalSteps int
// InternalSeeks is the total count of seeks in forward and backward
// directions performed on Pebble's internal iterator. If this is high
// relative to ExternalSeeks, it's a good indication that there's an
// accumulation of garbage within the LSM (NOT MVCC garbage).
InternalSeeks int
// InternalSteps is the total count of relative positioning operations (eg,
// Nexts, Prevs, NextPrefix, etc) in forward and backward directions
// performed on pebble's internal iterator. If this is high relative to
// ExternalSteps, it's a good indication that there's an accumulation of
// garbage within the LSM (NOT MVCC garbage).
InternalSteps int
}
// AggregatedBatchCommitStats holds cumulative stats summed over all the
// batches that committed at the engine. Since these are durations, only the
// mean (over an interval) can be recovered. We can change some of these to
// histograms once we know which ones are more useful.
type AggregatedBatchCommitStats struct {
Count uint64
BatchCommitStats
}
// MetricsForInterval is a set of pebble.Metrics that need to be saved in order
// to compute metrics according to an interval. The metrics recorded here are
// cumulative values; they are subtracted from the next cumulative values
// received in order to compute the delta for the interval.
type MetricsForInterval struct {
WALFsyncLatency prometheusgo.Metric
FlushWriteThroughput pebble.ThroughputMetric
WALFailoverWriteAndSyncLatency prometheusgo.Metric
}
// NumSSTables returns the total number of SSTables in the LSM, aggregated
// across levels.
func (m *Metrics) NumSSTables() int64 {
var num int64
for _, lm := range m.Metrics.Levels {
num += lm.NumFiles
}
return num
}
// IngestedBytes returns the sum of all ingested tables, aggregated across all
// levels of the LSM.
func (m *Metrics) IngestedBytes() uint64 {
var ingestedBytes uint64
for _, lm := range m.Metrics.Levels {
ingestedBytes += lm.BytesIngested
}
return ingestedBytes
}
// CompactedBytes returns the sum of bytes read and written during
// compactions across all levels of the LSM.
func (m *Metrics) CompactedBytes() (read, written uint64) {
for _, lm := range m.Metrics.Levels {
read += lm.BytesRead
written += lm.BytesCompacted
}
return read, written
}
// AsStoreStatsEvent converts a Metrics struct into an eventpb.StoreStats event,
// suitable for logging to the telemetry channel.
func (m *Metrics) AsStoreStatsEvent() eventpb.StoreStats {
e := eventpb.StoreStats{
CacheSize: m.BlockCache.Size,
CacheCount: m.BlockCache.Count,
CacheHits: m.BlockCache.Hits,
CacheMisses: m.BlockCache.Misses,
CompactionCountDefault: m.Compact.DefaultCount,
CompactionCountDeleteOnly: m.Compact.DeleteOnlyCount,
CompactionCountElisionOnly: m.Compact.ElisionOnlyCount,
CompactionCountMove: m.Compact.MoveCount,
CompactionCountRead: m.Compact.ReadCount,
CompactionCountRewrite: m.Compact.RewriteCount,
CompactionNumInProgress: m.Compact.NumInProgress,
CompactionMarkedFiles: int64(m.Compact.MarkedFiles),
FlushCount: m.Flush.Count,
FlushIngestCount: m.Flush.AsIngestCount,
FlushIngestTableCount: m.Flush.AsIngestTableCount,
FlushIngestTableBytes: m.Flush.AsIngestBytes,
IngestCount: m.Ingest.Count,
MemtableSize: m.MemTable.Size,
MemtableCount: m.MemTable.Count,
MemtableZombieCount: m.MemTable.ZombieCount,
MemtableZombieSize: m.MemTable.ZombieSize,
WalLiveCount: m.WAL.Files,
WalLiveSize: m.WAL.Size,
WalObsoleteCount: m.WAL.ObsoleteFiles,
WalObsoleteSize: m.WAL.ObsoletePhysicalSize,
WalPhysicalSize: m.WAL.PhysicalSize,
WalBytesIn: m.WAL.BytesIn,
WalBytesWritten: m.WAL.BytesWritten,
TableObsoleteCount: m.Table.ObsoleteCount,
TableObsoleteSize: m.Table.ObsoleteSize,
TableZombieCount: m.Table.ZombieCount,
TableZombieSize: m.Table.ZombieSize,
RangeKeySetsCount: m.Keys.RangeKeySetsCount,
}
for i, l := range m.Levels {
if l.NumFiles == 0 {
continue
}
e.Levels = append(e.Levels, eventpb.LevelStats{
Level: uint32(i),
NumFiles: l.NumFiles,
SizeBytes: l.Size,
Score: float32(l.Score),
BytesIn: l.BytesIn,
BytesIngested: l.BytesIngested,
BytesMoved: l.BytesMoved,
BytesRead: l.BytesRead,
BytesCompacted: l.BytesCompacted,
BytesFlushed: l.BytesFlushed,
TablesCompacted: l.TablesCompacted,
TablesFlushed: l.TablesFlushed,
TablesIngested: l.TablesIngested,
TablesMoved: l.TablesMoved,
NumSublevels: l.Sublevels,
})
}
return e
}
// GetIntent will look up an intent given a key. If there is no intent for a
// key, it will return nil rather than an error. Errors are returned for
// problems at the storage layer, problems decoding the key, problems
// unmarshalling the intent, a missing transaction on the intent, or multiple
// intents for this key.
func GetIntent(ctx context.Context, reader Reader, key roachpb.Key) (*roachpb.Intent, error) {
// Probe the lock table at key using a lock-table iterator.
opts := LockTableIteratorOptions{
Prefix: true,
// Ignore Exclusive and Shared locks. We only care about intents.
MatchMinStr: lock.Intent,
}
iter, err := NewLockTableIterator(ctx, reader, opts)
if err != nil {
return nil, err
}
defer iter.Close()
seekKey, _ := keys.LockTableSingleKey(key, nil)
valid, err := iter.SeekEngineKeyGE(EngineKey{Key: seekKey})
if err != nil {
return nil, err
}
if !valid {
return nil, nil
}
engineKey, err := iter.EngineKey()
if err != nil {
return nil, err
}
ltKey, err := engineKey.ToLockTableKey()
if err != nil {
return nil, err
}
if !ltKey.Key.Equal(key) {
// This should not be possible: we seeked to the lock table key for this
// key, and with prefix iteration any result must match it.
return nil, errors.AssertionFailedf("key does not match expected %v != %v", ltKey.Key, key)
}
if ltKey.Strength != lock.Intent {
return nil, errors.AssertionFailedf("unexpected strength for LockTableKey %s: %v", ltKey.Strength, ltKey)
}
var meta enginepb.MVCCMetadata
if err = iter.ValueProto(&meta); err != nil {
return nil, err
}
if meta.Txn == nil {
return nil, errors.AssertionFailedf("txn is null for key %v, intent %v", key, meta)
}
intent := roachpb.MakeIntent(meta.Txn, key)
hasNext, err := iter.NextEngineKey()
if err != nil {
// We expect false on the call to next, but not an error.
return nil, err
}
// This should not be possible. There can only be one outstanding write
// intent for a key, and with prefix matching we should not find additional keys.
if hasNext {
engineKey, err := iter.EngineKey()
if err != nil {
return nil, err
}
return nil, errors.AssertionFailedf("unexpected additional key found %v while looking for %v", engineKey, key)
}
return &intent, nil
}
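// getIntentExample is a hedged usage sketch (not part of the package API):
// it shows how a caller might distinguish "no intent" (nil, nil) from a real
// error when probing a single key with GetIntent.
func getIntentExample(ctx context.Context, reader Reader, key roachpb.Key) (bool, error) {
	intent, err := GetIntent(ctx, reader, key)
	if err != nil {
		// Storage, decoding, or unmarshalling problem; not "no intent".
		return false, err
	}
	// A nil intent means no intent exists for the key.
	return intent != nil, nil
}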
// Scan returns up to max point key/value objects from start (inclusive) to end
// (non-inclusive). Specify max=0 for unbounded scans. Since this code may use
// an intentInterleavingIter, the caller should not attempt a single scan to
// span local and global keys. See the comment in the declaration of
// intentInterleavingIter for details.
//
// NB: This function ignores MVCC range keys. It should only be used for tests.
func Scan(
ctx context.Context, reader Reader, start, end roachpb.Key, max int64,
) ([]MVCCKeyValue, error) {
var kvs []MVCCKeyValue
err := reader.MVCCIterate(ctx, start, end, MVCCKeyAndIntentsIterKind, IterKeyTypePointsOnly,
fs.UnknownReadCategory,
func(kv MVCCKeyValue, _ MVCCRangeKeyStack) error {
if max != 0 && int64(len(kvs)) >= max {
return iterutil.StopIteration()
}
kvs = append(kvs, kv)
return nil
})
return kvs, err
}
// ScanLocks scans locks (shared, exclusive, and intent) using only the lock
// table keyspace. It does not scan over the MVCC keyspace.
func ScanLocks(
ctx context.Context, reader Reader, start, end roachpb.Key, maxLocks, targetBytes int64,
) ([]roachpb.Lock, error) {
var locks []roachpb.Lock
if bytes.Compare(start, end) >= 0 {
return locks, nil
}
ltStart, _ := keys.LockTableSingleKey(start, nil)
ltEnd, _ := keys.LockTableSingleKey(end, nil)
iter, err := NewLockTableIterator(ctx, reader, LockTableIteratorOptions{
LowerBound: ltStart,
UpperBound: ltEnd,
MatchMinStr: lock.Shared, // all locks
})
if err != nil {
return nil, err
}
defer iter.Close()
var meta enginepb.MVCCMetadata
var lockBytes int64
var ok bool
for ok, err = iter.SeekEngineKeyGE(EngineKey{Key: ltStart}); ok; ok, err = iter.NextEngineKey() {
if err := ctx.Err(); err != nil {
return nil, err
}
if maxLocks != 0 && int64(len(locks)) >= maxLocks {
break
}
if targetBytes != 0 && lockBytes >= targetBytes {
break
}
key, err := iter.EngineKey()
if err != nil {
return nil, err
}
ltKey, err := key.ToLockTableKey()
if err != nil {
return nil, err
}
v, err := iter.UnsafeValue()
if err != nil {
return nil, err
}
if err = protoutil.Unmarshal(v, &meta); err != nil {
return nil, err
}
locks = append(locks, roachpb.MakeLock(meta.Txn, ltKey.Key, ltKey.Strength))
lockBytes += int64(len(ltKey.Key)) + int64(len(v))
}
if err != nil {
return nil, err
}
return locks, nil
}
// WriteSyncNoop carries out a synchronous no-op write to the engine.
func WriteSyncNoop(eng Engine) error {
batch := eng.NewBatch()
defer batch.Close()
if err := batch.LogData(nil); err != nil {
return err
}
if err := batch.Commit(true /* sync */); err != nil {
return err
}
return nil
}
// ClearRangeWithHeuristic clears the keys from start (inclusive) to end
// (exclusive), including any range keys, but does not clear intents unless the
// lock table is targeted explicitly. Depending on the number of keys, it will
// either write a Pebble range tombstone or clear individual keys. If it uses
// a range tombstone, it will tighten the span to the first encountered key.
//
// pointKeyThreshold and rangeKeyThreshold specify the number of point/range
// keys respectively where it will switch from clearing individual keys to
// Pebble range tombstones (RANGEDEL or RANGEKEYDEL respectively). A threshold
// of 0 disables checking for and clearing that key type.
//
// NB: An initial scan will be done to determine the type of clear, so a large
// threshold will potentially involve scanning a large number of keys twice.
//
// TODO(erikgrinaker): Consider tightening the end of the range tombstone span
// too, by doing a SeekLT when we reach the threshold. It's unclear whether it's
// really worth it.
func ClearRangeWithHeuristic(
ctx context.Context,
r Reader,
w Writer,
start, end roachpb.Key,
pointKeyThreshold, rangeKeyThreshold int,
) error {
clearPointKeys := func(r Reader, w Writer, start, end roachpb.Key, threshold int) error {
iter, err := r.NewEngineIterator(ctx, IterOptions{
KeyTypes: IterKeyTypePointsOnly,
LowerBound: start,
UpperBound: end,
})
if err != nil {
return err
}
defer iter.Close()
// Scan, and drop a RANGEDEL if we reach the threshold. We tighten the span
// to the first encountered key, since we can cheaply do so.
var ok bool
var count int
var firstKey roachpb.Key
for ok, err = iter.SeekEngineKeyGE(EngineKey{Key: start}); ok; ok, err = iter.NextEngineKey() {
count++
if len(firstKey) == 0 {
key, err := iter.UnsafeEngineKey()
if err != nil {
return err
}
firstKey = key.Key.Clone()
}
if count >= threshold {
return w.ClearRawRange(firstKey, end, true /* pointKeys */, false /* rangeKeys */)
}
}
if err != nil || count == 0 {
return err
}
// Clear individual points.
for ok, err = iter.SeekEngineKeyGE(EngineKey{Key: start}); ok; ok, err = iter.NextEngineKey() {
key, err := iter.UnsafeEngineKey()
if err != nil {
return err
}
if err = w.ClearEngineKey(key, ClearOptions{
ValueSizeKnown: true,
ValueSize: uint32(iter.ValueLen()),
}); err != nil {
return err
}
}
return err
}
clearRangeKeys := func(r Reader, w Writer, start, end roachpb.Key, threshold int) error {
iter, err := r.NewEngineIterator(ctx, IterOptions{
KeyTypes: IterKeyTypeRangesOnly,
LowerBound: start,
UpperBound: end,
})
if err != nil {
return err
}
defer iter.Close()
// Scan, and drop a RANGEKEYDEL if we reach the threshold.
var ok bool
var count int
var firstKey roachpb.Key
for ok, err = iter.SeekEngineKeyGE(EngineKey{Key: start}); ok; ok, err = iter.NextEngineKey() {
count += len(iter.EngineRangeKeys())
if len(firstKey) == 0 {
bounds, err := iter.EngineRangeBounds()
if err != nil {
return err
}
firstKey = bounds.Key.Clone()
}
if count >= threshold {
return w.ClearRawRange(firstKey, end, false /* pointKeys */, true /* rangeKeys */)
}
}
if err != nil || count == 0 {
return err
}
// Clear individual range keys.
for ok, err = iter.SeekEngineKeyGE(EngineKey{Key: start}); ok; ok, err = iter.NextEngineKey() {
bounds, err := iter.EngineRangeBounds()
if err != nil {
return err
}
for _, v := range iter.EngineRangeKeys() {
if err := w.ClearEngineRangeKey(bounds.Key, bounds.EndKey, v.Version); err != nil {
return err
}
}
}
return err
}
if pointKeyThreshold > 0 {
if err := clearPointKeys(r, w, start, end, pointKeyThreshold); err != nil {
return err
}
}
if rangeKeyThreshold > 0 {
if err := clearRangeKeys(r, w, start, end, rangeKeyThreshold); err != nil {
return err
}
}
return nil
}
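// clearRangeWithHeuristicExample is a hedged usage sketch; the thresholds are
// illustrative, not tuned recommendations. It stages the clear in a batch,
// switching to Pebble range tombstones once 64 point keys or 8 range keys
// have been seen, and then commits the batch.
func clearRangeWithHeuristicExample(
	ctx context.Context, eng Engine, start, end roachpb.Key,
) error {
	batch := eng.NewBatch()
	defer batch.Close()
	if err := ClearRangeWithHeuristic(
		ctx, eng, batch, start, end, 64 /* pointKeyThreshold */, 8 /* rangeKeyThreshold */,
	); err != nil {
		return err
	}
	return batch.Commit(true /* sync */)
}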
var ingestDelayL0Threshold = settings.RegisterIntSetting(
settings.ApplicationLevel,
"rocksdb.ingest_backpressure.l0_file_count_threshold",
"number of L0 files after which to backpressure SST ingestions",
20,
)
var ingestDelayTime = settings.RegisterDurationSetting(
settings.ApplicationLevel,
"rocksdb.ingest_backpressure.max_delay",
"maximum amount of time to backpressure a single SST ingestion",
time.Second*5,
)
var preIngestDelayEnabled = settings.RegisterBoolSetting(
settings.SystemOnly,
"pebble.pre_ingest_delay.enabled",
"controls whether the pre-ingest delay mechanism is active",
false,
)
// PreIngestDelay may choose to block for some duration if L0 has an excessive
// number of files in it or if PendingCompactionBytesEstimate is elevated. It
// is intended to be called before ingesting a new SST, since we'd rather
// backpressure the bulk operation adding SSTs than slow down the whole RocksDB
// instance and impact all foreground traffic by adding too many files to it.
// After the number of L0 files exceeds the configured limit, it gradually
// begins delaying more for each additional file in L0 over the limit until
// hitting its configured (via settings) maximum delay. If the pending
// compaction limit is exceeded, it waits for the maximum delay.
func preIngestDelay(ctx context.Context, eng Engine, settings *cluster.Settings) {
if settings == nil {
return
}
if !preIngestDelayEnabled.Get(&settings.SV) {
return
}
metrics := eng.GetMetrics()
targetDelay := calculatePreIngestDelay(settings, metrics.Metrics)
if targetDelay == 0 {
return
}
log.VEventf(ctx, 2, "delaying SST ingestion %s. %d L0 files, %d L0 Sublevels",
targetDelay, metrics.Levels[0].NumFiles, metrics.Levels[0].Sublevels)
select {
case <-time.After(targetDelay):
case <-ctx.Done():
}
}
func calculatePreIngestDelay(settings *cluster.Settings, metrics *pebble.Metrics) time.Duration {
maxDelay := ingestDelayTime.Get(&settings.SV)
l0ReadAmpLimit := ingestDelayL0Threshold.Get(&settings.SV)
const ramp = 10
l0ReadAmp := metrics.Levels[0].NumFiles
if metrics.Levels[0].Sublevels >= 0 {
l0ReadAmp = int64(metrics.Levels[0].Sublevels)
}
if l0ReadAmp > l0ReadAmpLimit {
delayPerFile := maxDelay / time.Duration(ramp)
targetDelay := time.Duration(l0ReadAmp-l0ReadAmpLimit) * delayPerFile
if targetDelay > maxDelay {
return maxDelay
}
return targetDelay
}
return 0
}
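// preIngestDelayWorkedExample is a hedged, self-contained illustration of the
// delay formula above, using the default settings (maxDelay = 5s, L0
// threshold = 20, ramp = 10, so delayPerFile = 500ms). An L0 read-amp of 26
// yields (26-20)*500ms = 3s; larger read-amps are capped at maxDelay.
func preIngestDelayWorkedExample(l0ReadAmp int64) time.Duration {
	const maxDelay = 5 * time.Second
	const l0Limit = 20
	const ramp = 10
	if l0ReadAmp <= l0Limit {
		return 0
	}
	delay := time.Duration(l0ReadAmp-l0Limit) * (maxDelay / ramp)
	if delay > maxDelay {
		return maxDelay
	}
	return delay
}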
// Helper function to implement Reader.MVCCIterate().
func iterateOnReader(
ctx context.Context,
reader Reader,
start, end roachpb.Key,
iterKind MVCCIterKind,
keyTypes IterKeyType,
readCategory fs.ReadCategory,
f func(MVCCKeyValue, MVCCRangeKeyStack) error,
) error {
if reader.Closed() {
return errors.New("cannot call MVCCIterate on a closed batch")
}
if start.Compare(end) >= 0 {
return nil
}
it, err := reader.NewMVCCIterator(ctx, iterKind, IterOptions{
KeyTypes: keyTypes,
LowerBound: start,
UpperBound: end,
ReadCategory: readCategory,
})
if err != nil {
return err
}
defer it.Close()
var rangeKeys MVCCRangeKeyStack // cached during iteration
for it.SeekGE(MakeMVCCMetadataKey(start)); ; it.Next() {
if ok, err := it.Valid(); err != nil {
return err
} else if !ok {
break
}
var kv MVCCKeyValue
if hasPoint, _ := it.HasPointAndRange(); hasPoint {
v, err := it.Value()
if err != nil {
return err
}
kv = MVCCKeyValue{Key: it.UnsafeKey().Clone(), Value: v}
}
if !it.RangeBounds().Key.Equal(rangeKeys.Bounds.Key) {
rangeKeys = it.RangeKeys().Clone()
}
if err := f(kv, rangeKeys); err != nil {
return iterutil.Map(err)
}
}
return nil
}
// assertSimpleMVCCIteratorInvariants asserts invariants in the
// SimpleMVCCIterator interface that should hold for all implementations,
// returning errors.AssertionFailedf for any violations. The iterator
// must be valid.
func assertSimpleMVCCIteratorInvariants(iter SimpleMVCCIterator) error {
key := iter.UnsafeKey()
// Keys can't be empty.
if len(key.Key) == 0 {
return errors.AssertionFailedf("valid iterator returned empty key")
}
// Can't be positioned in the lock table.
if bytes.HasPrefix(key.Key, keys.LocalRangeLockTablePrefix) {
return errors.AssertionFailedf("MVCC iterator positioned in lock table at %s", key)
}
// Any valid position must have either a point and/or range key.
hasPoint, hasRange := iter.HasPointAndRange()
if !hasPoint && !hasRange {
// NB: MVCCIncrementalIterator can return hasPoint=false,hasRange=false
// following a NextIgnoringTime() call. We explicitly allow this here.
if incrIter, ok := iter.(*MVCCIncrementalIterator); !ok || !incrIter.ignoringTime {
return errors.AssertionFailedf("valid iterator without point/range keys at %s", key)
}
}
// Range key assertions.
if hasRange {
// Must have bounds. The MVCCRangeKey.Validate() call below will make
// further bounds assertions.
bounds := iter.RangeBounds()
if len(bounds.Key) == 0 && len(bounds.EndKey) == 0 {
return errors.AssertionFailedf("hasRange=true but empty range bounds at %s", key)
}
// Iterator position must be within range key bounds.
if !bounds.ContainsKey(key.Key) {
return errors.AssertionFailedf("iterator position %s outside range bounds %s", key, bounds)
}
// Bounds must match range key stack.
rangeKeys := iter.RangeKeys()
if !rangeKeys.Bounds.Equal(bounds) {
return errors.AssertionFailedf("range bounds %s does not match range key %s",
bounds, rangeKeys.Bounds)
}
// Must have range keys.
if rangeKeys.IsEmpty() {
return errors.AssertionFailedf("hasRange=true but no range key versions at %s", key)
}
for i, v := range rangeKeys.Versions {
// Range key must be valid.
rangeKey := rangeKeys.AsRangeKey(v)
if err := rangeKey.Validate(); err != nil {
return errors.NewAssertionErrorWithWrappedErrf(err, "invalid range key at %s", key)
}
// Range keys must be in descending timestamp order.
if i > 0 && !v.Timestamp.Less(rangeKeys.Versions[i-1].Timestamp) {
return errors.AssertionFailedf("range key %s not below version %s",
rangeKey, rangeKeys.Versions[i-1].Timestamp)
}
// Range keys must currently be tombstones.
if value, err := DecodeMVCCValue(v.Value); err != nil {
return errors.NewAssertionErrorWithWrappedErrf(err, "invalid range key value at %s",
rangeKey)
} else if !value.IsTombstone() {
return errors.AssertionFailedf("non-tombstone range key %s with value %x",
rangeKey, value.Value.RawBytes)
}
}
}
if hasPoint {
value, err := iter.UnsafeValue()
if err != nil {
return err
}
valueLen := iter.ValueLen()
if len(value) != valueLen {
return errors.AssertionFailedf("length of UnsafeValue %d != ValueLen %d", len(value), valueLen)
}
if key.IsValue() {
valueLen2, isTombstone, err := iter.MVCCValueLenAndIsTombstone()
if err == nil {
if len(value) != valueLen2 {
return errors.AssertionFailedf("length of UnsafeValue %d != MVCCValueLenAndIsTombstone %d",
len(value), valueLen2)
}
if v, err := DecodeMVCCValue(value); err == nil {
if isTombstone != v.IsTombstone() {
return errors.AssertionFailedf("isTombstone from MVCCValueLenAndIsTombstone %t != MVCCValue.IsTombstone %t",
isTombstone, v.IsTombstone())
}
// Else err != nil. SimpleMVCCIterator is not responsible for data
// corruption, since it is possible that the implementation of
// MVCCValueLenAndIsTombstone is fetching information from a
// different part of the store than where the value is stored.
}
}
// Else err != nil. Ignore, since SimpleMVCCIterator is not to be held
// responsible for data corruption or tests writing non-MVCCValues.
}
}
return nil
}
// assertMVCCIteratorInvariants asserts invariants in the MVCCIterator interface
// that should hold for all implementations, returning errors.AssertionFailedf
// for any violations. It calls through to assertSimpleMVCCIteratorInvariants().
// The iterator must be valid.
func assertMVCCIteratorInvariants(iter MVCCIterator) error {
// Assert SimpleMVCCIterator invariants.
if err := assertSimpleMVCCIteratorInvariants(iter); err != nil {
return err
}
key := iter.UnsafeKey().Clone()
// UnsafeRawMVCCKey must match Key.
if r, err := DecodeMVCCKey(iter.UnsafeRawMVCCKey()); err != nil {
return errors.NewAssertionErrorWithWrappedErrf(
err, "failed to decode UnsafeRawMVCCKey at %s",
key,
)
} else if !r.Equal(key) {
return errors.AssertionFailedf("UnsafeRawMVCCKey %s does not match Key %s", r, key)
}
// UnsafeRawKey must either be an MVCC key matching Key, or a lock table key
// that refers to it.
if engineKey, ok := DecodeEngineKey(iter.UnsafeRawKey()); !ok {
return errors.AssertionFailedf("failed to decode UnsafeRawKey as engine key at %s", key)
} else if engineKey.IsMVCCKey() {
if k, err := engineKey.ToMVCCKey(); err != nil {
return errors.NewAssertionErrorWithWrappedErrf(err, "invalid UnsafeRawKey at %s", key)
} else if !k.Equal(key) {
return errors.AssertionFailedf("UnsafeRawKey %s does not match Key %s", k, key)
}
} else if engineKey.IsLockTableKey() {
if k, err := engineKey.ToLockTableKey(); err != nil {
return errors.NewAssertionErrorWithWrappedErrf(err, "invalid UnsafeRawKey at %s", key)
} else if !k.Key.Equal(key.Key) {
return errors.AssertionFailedf("UnsafeRawKey lock table key %s does not match Key %s", k, key)
} else if !key.Timestamp.IsEmpty() {
return errors.AssertionFailedf(
"UnsafeRawKey lock table key %s for Key %s with non-zero timestamp", k, key,
)
}
} else {
return errors.AssertionFailedf("unknown type for engine key %s", engineKey)
}
// If the iterator position has a point key, Value must equal UnsafeValue.
// NB: It's only valid to read an iterator's Value if the iterator is
// positioned at a point key.
if hasPoint, _ := iter.HasPointAndRange(); hasPoint {
u, err := iter.UnsafeValue()
if err != nil {
return err
}
v, err := iter.Value()
if err != nil {
return err
}
if !bytes.Equal(v, u) {
return errors.AssertionFailedf("Value %x does not match UnsafeValue %x at %s", v, u, key)
}
}
// For prefix iterators, any range keys must be point-sized. We've already
// asserted that the range key covers the iterator position.
if iter.IsPrefix() {
if _, hasRange := iter.HasPointAndRange(); hasRange {
if bounds := iter.RangeBounds(); !bounds.EndKey.Equal(bounds.Key.Next()) {
return errors.AssertionFailedf("prefix iterator with wide range key %s", bounds)
}
}
}
return nil
}
// ScanConflictingIntentsForDroppingLatchesEarly scans intents using only the
// separated intents lock table on behalf of a batch request trying to drop its
// latches early. If found, conflicting intents are added to the supplied
// `intents` slice, which indicates to the caller that evaluation should not
// proceed until the intents are resolved. Intents that don't conflict with the
// transaction referenced by txnID[1] at the supplied `ts` are ignored; so are
// {Shared,Exclusive} replicated locks, as they do not conflict with non-locking
// reads.
//
// The `needIntentHistory` return value indicates whether the caller needs to
// consult intent history when performing a scan over the MVCC keyspace to
// read correct provisional values for at least one of the keys being scanned.
// Typically, this applies to all transactions that read their own writes.
//
// [1] The supplied txnID may be empty (uuid.Nil) if the request on behalf of
// which the scan is being performed is non-transactional.
func ScanConflictingIntentsForDroppingLatchesEarly(
ctx context.Context,
reader Reader,
txnID uuid.UUID,
ts hlc.Timestamp,
start, end roachpb.Key,
intents *[]roachpb.Intent,
maxLockConflicts int64,
targetLockConflictBytes int64,
) (needIntentHistory bool, err error) {
if err := ctx.Err(); err != nil {
return false, err
}
upperBoundUnset := len(end) == 0 // NB: Get requests do not set the end key.
if !upperBoundUnset && bytes.Compare(start, end) >= 0 {
return true, errors.AssertionFailedf("start key must be less than end key")
}
ltStart, _ := keys.LockTableSingleKey(start, nil)
opts := LockTableIteratorOptions{
LowerBound: ltStart,
// Ignore Exclusive and Shared locks; we only drop latches early for
// non-locking reads, which do not conflict with Shared or
// Exclusive[1] locks.
//
// [1] Specifically replicated Exclusive locks. Interaction with
// unreplicated locks is governed by the ExclusiveLocksBlockNonLockingReads
// cluster setting.
MatchMinStr: lock.Intent,
ReadCategory: fs.BatchEvalReadCategory,
}
if upperBoundUnset {
opts.Prefix = true
} else {
ltEnd, _ := keys.LockTableSingleKey(end, nil)
opts.UpperBound = ltEnd
}
iter, err := NewLockTableIterator(ctx, reader, opts)
if err != nil {
return false, err
}
defer iter.Close()
if log.ExpensiveLogEnabled(ctx, 3) {
defer func() {
ss := iter.Stats().Stats
log.VEventf(ctx, 3, "lock table scan stats: %s", ss.String())
}()
}
var meta enginepb.MVCCMetadata
var ok bool
intentSize := int64(0)
for ok, err = iter.SeekEngineKeyGE(EngineKey{Key: ltStart}); ok; ok, err = iter.NextEngineKey() {
if maxLockConflicts != 0 && int64(len(*intents)) >= maxLockConflicts {
// Return early if we're done accumulating intents; make no claims about
// not needing intent history.
return true /* needsIntentHistory */, nil
}
if targetLockConflictBytes != 0 && intentSize >= targetLockConflictBytes {
// Return early if we exceed the intent byte limit; make no claims about
// not needing intent history.
return true /* needsIntentHistory */, nil
}
err := iter.ValueProto(&meta)
if err != nil {
return false, err
}
if meta.Txn == nil {
return false, errors.Errorf("intent without transaction")
}
ownIntent := txnID != uuid.Nil && txnID == meta.Txn.ID
if ownIntent {
// If we ran into one of our own intents, a corresponding scan over the
// MVCC keyspace will need access to the key's intent history in order to
// read the correct provisional value. As such, we set `needsIntentHistory`
// to be true.
//
// This determination is more restrictive than it needs to be. A read
// request needs access to the intent history when performing a scan over
// the MVCC keyspace only if:
// 1. The request is reading at a lower sequence number than the intent's
// sequence number.
// 2. OR the request is reading at a (strictly) lower timestamp than the
// intent timestamp[1]. This can happen if the intent was pushed for some
// reason.
// 3. OR the found intent should be ignored because it was written as part
// of a savepoint which was subsequently rolled back.
// 4. OR the found intent and read request belong to different txn epochs.
//
// The conditions above mirror special case handling for intents by
// pebbleMVCCScanner's getOne method. If we find scanning the lock table
// twice (once during conflict resolution, and once when interleaving
// intents during the MVCC read) is too expensive for transactions that
// read their own writes, there's some optimizations to be had here by
// being smarter about when we decide to interleave intents or not to.
//
// [1] Only relevant if the intent has a sequence number less than or
// equal to the read request's sequence number. Otherwise, we need access
// to the intent history to read the correct provisional value -- one
// written at a lower or equal sequence number compared to the read
// request's.
needIntentHistory = true
continue
}
if intentConflicts := meta.Timestamp.ToTimestamp().LessEq(ts); !intentConflicts {
continue
}
key, err := iter.EngineKey()
if err != nil {
return false, err
}
ltKey, err := key.ToLockTableKey()
if err != nil {
return false, err
}
if ltKey.Strength != lock.Intent {
return false, errors.AssertionFailedf("unexpected strength for LockTableKey %s", ltKey.Strength)
}
conflictingIntent := roachpb.MakeIntent(meta.Txn, ltKey.Key)
intentSize += int64(conflictingIntent.Size())
*intents = append(*intents, conflictingIntent)
}
if err != nil {
return false, err
}
if err := ctx.Err(); err != nil {
return false, err
}
return needIntentHistory, nil /* err */
}
// Copyright 2020 The Cockroach Authors.
//
// Use of this software is governed by the CockroachDB Software License
// included in the /LICENSE file.
package storage
import (
"encoding/binary"
"fmt"
"github.com/cockroachdb/cockroach/pkg/keys"
"github.com/cockroachdb/cockroach/pkg/kv/kvserver/concurrency/lock"
"github.com/cockroachdb/cockroach/pkg/roachpb"
"github.com/cockroachdb/cockroach/pkg/storage/enginepb"
"github.com/cockroachdb/cockroach/pkg/util/protoutil"
"github.com/cockroachdb/cockroach/pkg/util/uuid"
"github.com/cockroachdb/errors"
)
// EngineKey is the general key type that is stored in the engine. It consists
// of a roachpb.Key followed by an optional "version". The term "version" is
// a loose one: often the version is a real version represented as an hlc.Timestamp,
// but it can also be the suffix of a lock table key containing the lock strength
// and txn UUID. These special cases have their own types, MVCCKey and LockTableKey.
// For key kinds that will never have a version, the code has historically used
// MVCCKey, though future code may be better served by using EngineKey (and we
// should consider changing all the legacy code).
//
// The version can have the following lengths in addition to 0 length.
// - Timestamp of MVCC keys: 8, 12, or 13 bytes.
// - Lock table key: 17 bytes.
type EngineKey struct {
Key roachpb.Key
Version []byte
}
// There are multiple decoding functions in the storage package, optimized for
// their particular use case, that demultiplex on the various lengths below.
// If adding another length to this list, remember to search for code
// referencing these lengths and fix it.
// TODO(nvanbenschoten): unify these constants with those in mvcc_key.go.
const (
engineKeyNoVersion = 0
engineKeyVersionWallTimeLen = 8
engineKeyVersionWallAndLogicalTimeLen = 12
engineKeyVersionWallLogicalAndSyntheticTimeLen = 13
engineKeyVersionLockTableLen = 17
)
// Format implements the fmt.Formatter interface
func (k EngineKey) Format(f fmt.State, c rune) {
fmt.Fprintf(f, "%s/%x", k.Key, k.Version)
}
// Encoding:
// Key + \x00 (sentinel) [+ Version + <byte representing length of Version + 1>]
//
// The motivation for the sentinel is that we configure the underlying storage
// engine (Pebble) with a Split function that can be used for constructing
// Bloom filters over just the Key field. However, the encoded Key must also
// look like an encoded EngineKey. By splitting at Key + \x00, the Key looks
// like an EngineKey with no Version.
const (
sentinel = '\x00'
sentinelLen = 1
suffixEncodedLengthLen = 1
)
// Copy makes a copy of the key.
func (k EngineKey) Copy() EngineKey {
buf := make([]byte, len(k.Key)+len(k.Version))
copy(buf, k.Key)
k.Key = buf[:len(k.Key)]
if len(k.Version) > 0 {
versionCopy := buf[len(k.Key):]
copy(versionCopy, k.Version)
k.Version = versionCopy
}
return k
}
// EncodedLen returns the encoded length of k.
func (k EngineKey) EncodedLen() int {
n := len(k.Key) + suffixEncodedLengthLen
versionLen := len(k.Version)
if versionLen > 0 {
n += sentinelLen + versionLen
}
return n
}
// Encode encodes the key.
func (k EngineKey) Encode() []byte {
encodedLen := k.EncodedLen()
buf := make([]byte, encodedLen)
k.encodeToSizedBuf(buf)
return buf
}
// EncodeToBuf attempts to reuse buf for encoding the key, and if undersized,
// allocates a new buffer.
func (k EngineKey) EncodeToBuf(buf []byte) []byte {
encodedLen := k.EncodedLen()
if cap(buf) < encodedLen {
buf = make([]byte, encodedLen)
} else {
buf = buf[:encodedLen]
}
k.encodeToSizedBuf(buf)
return buf
}
func (k EngineKey) encodeToSizedBuf(buf []byte) {
copy(buf, k.Key)
pos := len(k.Key)
// The length of the suffix is the full encoded length (len(buf)) minus the
// length of the key minus the length of the sentinel. Note that the
// suffixLen is 0 when Version is empty, and when Version is non-empty, it
// is len(Version)+1. That is, it includes the length byte at the end.
suffixLen := len(buf) - pos - 1
if suffixLen > 0 {
buf[pos] = 0
pos += sentinelLen
copy(buf[pos:], k.Version)
}
buf[len(buf)-1] = byte(suffixLen)
}
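// engineKeyEncodingExample is a hedged illustration of the encoding described
// above; the key and version bytes are made up. A key "a" with an 8-byte
// wall-time version encodes as 0x61 0x00 <8 version bytes> 0x09 (the trailing
// byte is len(Version)+1), while the same key with no version encodes as
// 0x61 0x00 (a zero suffix-length byte).
func engineKeyEncodingExample() []byte {
	version := []byte{0, 0, 0, 0, 0, 0, 0, 1} // hypothetical wall time
	k := EngineKey{Key: roachpb.Key("a"), Version: version}
	return k.Encode() // 11 bytes: 0x61 0x00 <version> 0x09
}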
// IsMVCCKey returns true if the key can be decoded as an MVCCKey.
// This includes the case of an empty timestamp.
func (k EngineKey) IsMVCCKey() bool {
l := len(k.Version)
return l == engineKeyNoVersion ||
l == engineKeyVersionWallTimeLen ||
l == engineKeyVersionWallAndLogicalTimeLen ||
l == engineKeyVersionWallLogicalAndSyntheticTimeLen
}
// IsLockTableKey returns true if the key can be decoded as a LockTableKey.
func (k EngineKey) IsLockTableKey() bool {
return len(k.Version) == engineKeyVersionLockTableLen
}
// ToMVCCKey constructs a MVCCKey from the EngineKey.
func (k EngineKey) ToMVCCKey() (MVCCKey, error) {
key := MVCCKey{Key: k.Key}
switch len(k.Version) {
case engineKeyNoVersion:
// No-op.
case engineKeyVersionWallTimeLen:
key.Timestamp.WallTime = int64(binary.BigEndian.Uint64(k.Version[0:8]))
case engineKeyVersionWallAndLogicalTimeLen, engineKeyVersionWallLogicalAndSyntheticTimeLen:
key.Timestamp.WallTime = int64(binary.BigEndian.Uint64(k.Version[0:8]))
key.Timestamp.Logical = int32(binary.BigEndian.Uint32(k.Version[8:12]))
// NOTE: byte 13 used to store the timestamp's synthetic bit, but this is no
// longer consulted and can be ignored during decoding.
default:
return MVCCKey{}, errors.Errorf("version is not an encoded timestamp %x", k.Version)
}
return key, nil
}
// ToLockTableKey constructs a LockTableKey from the EngineKey.
func (k EngineKey) ToLockTableKey() (LockTableKey, error) {
lockedKey, err := keys.DecodeLockTableSingleKey(k.Key)
if err != nil {
return LockTableKey{}, err
}
key := LockTableKey{Key: lockedKey}
key.Strength, key.TxnUUID, err = k.decodeLockTableKeyVersion()
if err != nil {
return LockTableKey{}, err
}
return key, nil
}
// decodeLockTableKeyVersion decodes the strength and transaction ID from the
// version of a LockTableKey, without decoding the key.
func (k EngineKey) decodeLockTableKeyVersion() (lock.Strength, uuid.UUID, error) {
if len(k.Version) != engineKeyVersionLockTableLen {
return 0, uuid.UUID{}, errors.Errorf("version is not valid for a LockTableKey %x", k.Version)
}
str, err := getReplicatedLockStrengthForByte(k.Version[0])
if err != nil {
return 0, uuid.UUID{}, err
}
txnID := *(*uuid.UUID)(k.Version[1:])
return str, txnID, nil
}
// Validate checks if the EngineKey is a valid MVCCKey or LockTableKey.
func (k EngineKey) Validate() error {
if k.IsLockTableKey() {
return keys.ValidateLockTableSingleKey(k.Key)
}
_, errMVCC := k.ToMVCCKey()
return errMVCC
}
// DecodeEngineKey decodes the given bytes as an EngineKey. If the caller
// already knows that the key is an MVCCKey, the Version returned is the
// encoded timestamp.
func DecodeEngineKey(b []byte) (key EngineKey, ok bool) {
if len(b) == 0 {
return EngineKey{}, false
}
// Last byte is the version length + 1 when there is a version,
// else it is 0.
versionLen := int(b[len(b)-1])
if versionLen == 1 {
// The key encodes an empty version, which is not valid.
return EngineKey{}, false
}
// keyPartEnd points to the sentinel byte.
keyPartEnd := len(b) - 1 - versionLen
if keyPartEnd < 0 || b[keyPartEnd] != 0x00 {
return EngineKey{}, false
}
// Key excludes the sentinel byte.
key.Key = b[:keyPartEnd]
if versionLen > 0 {
// Version consists of the bytes after the sentinel and before the length.
key.Version = b[keyPartEnd+1 : len(b)-1]
}
return key, true
}
// GetKeyPartFromEngineKey is a specialization of DecodeEngineKey which avoids
// constructing a slice for the version part of the key, since the caller does
// not need it.
func GetKeyPartFromEngineKey(engineKey []byte) (key []byte, ok bool) {
if len(engineKey) == 0 {
return nil, false
}
// Last byte is the version length + 1 when there is a version,
// else it is 0.
versionLen := int(engineKey[len(engineKey)-1])
// keyPartEnd points to the sentinel byte.
keyPartEnd := len(engineKey) - 1 - versionLen
if keyPartEnd < 0 || engineKey[keyPartEnd] != 0x00 {
return nil, false
}
// Key excludes the sentinel byte.
return engineKey[:keyPartEnd], true
}
// EngineKeyFormatter is a fmt.Formatter for EngineKeys.
type EngineKeyFormatter struct {
key EngineKey
}
var _ fmt.Formatter = EngineKeyFormatter{}
// Format implements the fmt.Formatter interface.
func (m EngineKeyFormatter) Format(f fmt.State, c rune) {
m.key.Format(f, c)
}
// LockTableKey is a key representing a lock in the lock table.
type LockTableKey struct {
Key roachpb.Key
Strength lock.Strength
TxnUUID uuid.UUID
}
// replicatedLockStrengthToByte is a mapping between lock.Strength and the
// strength byte persisted in a lock table key's encoding. See
// LockTableKey.ToEngineKey().
var replicatedLockStrengthToByte = [...]byte{
lock.Shared: 1,
lock.Exclusive: 2,
lock.Intent: 3,
}
// byteToReplicatedLockStrength is a mapping between the strength byte persisted
// in a lock table key's encoding and the lock.Strength of the lock it
// corresponds to. Also see EngineKey.ToLockTableKey().
var byteToReplicatedLockStrength = func() (arr []lock.Strength) {
maxByte := byte(0)
for _, b := range replicatedLockStrengthToByte {
if b > maxByte {
maxByte = b
}
}
arr = make([]lock.Strength, maxByte+1)
for str, b := range replicatedLockStrengthToByte {
if b != 0 {
arr[b] = lock.Strength(str)
}
}
return arr
}()
// getByteForReplicatedLockStrength returns a strength byte, suitable for use in
// a lock's key encoding, given its lock strength.
func getByteForReplicatedLockStrength(str lock.Strength) byte {
if str < 0 || int(str) >= len(replicatedLockStrengthToByte) {
panic(errors.AssertionFailedf("unexpected lock strength: %s", str))
}
b := replicatedLockStrengthToByte[str]
if b == 0 {
panic(errors.AssertionFailedf("unexpected lock strength: %s", str))
}
return b
}
// getReplicatedLockStrengthForByte returns a replicated lock's strength given
// the strength byte from its key encoding.
func getReplicatedLockStrengthForByte(b byte) (lock.Strength, error) {
if int(b) >= len(byteToReplicatedLockStrength) { // byte cannot be < 0
return lock.None, errors.AssertionFailedf("unexpected lock strength byte: %d", b)
}
str := byteToReplicatedLockStrength[b]
if str == 0 {
return lock.None, errors.AssertionFailedf("unexpected lock strength byte: %d", b)
}
return str, nil
}
// mustGetReplicatedLockStrengthForByte is like getReplicatedLockStrengthForByte,
// except it panics if there is an error.
func mustGetReplicatedLockStrengthForByte(b byte) lock.Strength {
str, err := getReplicatedLockStrengthForByte(b)
if err != nil {
panic(err)
}
return str
}
// ToEngineKey converts a lock table key to an EngineKey. buf is used as
// scratch-space to avoid allocations -- its contents will be overwritten and
// not appended to.
func (lk LockTableKey) ToEngineKey(buf []byte) (EngineKey, []byte) {
// The first term in estimatedLen is for LockTableSingleKey.
estimatedLen :=
(len(keys.LocalRangeLockTablePrefix) + len(keys.LockTableSingleKeyInfix) + len(lk.Key) + 3) +
engineKeyVersionLockTableLen
if cap(buf) < estimatedLen {
buf = make([]byte, 0, estimatedLen)
}
ltKey, buf := keys.LockTableSingleKey(lk.Key, buf)
k := EngineKey{Key: ltKey}
if cap(buf)-len(buf) >= engineKeyVersionLockTableLen {
k.Version = buf[len(buf) : len(buf)+engineKeyVersionLockTableLen]
} else {
// estimatedLen was an underestimate.
k.Version = make([]byte, engineKeyVersionLockTableLen)
}
k.Version[0] = getByteForReplicatedLockStrength(lk.Strength)
copy(k.Version[1:], lk.TxnUUID[:])
return k, buf
}
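// lockTableKeyEncodingExample is a hedged illustration of the lock table key
// version layout produced by ToEngineKey: one strength byte followed by the
// 16-byte transaction UUID, i.e. engineKeyVersionLockTableLen (17) bytes in
// total. The key and UUID are made up.
func lockTableKeyEncodingExample() EngineKey {
	lk := LockTableKey{
		Key:      roachpb.Key("k"),
		Strength: lock.Exclusive, // encoded as strength byte 2
		TxnUUID:  uuid.MakeV4(),
	}
	ek, _ := lk.ToEngineKey(nil)
	// ek.Version[0] == 2 and ek.Version[1:] holds the txn UUID bytes.
	return ek
}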
// EncodedSize returns the size of the LockTableKey when encoded.
func (lk LockTableKey) EncodedSize() int64 {
return int64(len(lk.Key)) + engineKeyVersionLockTableLen
}
// EngineRangeKeyValue is a raw value for a general range key as stored in the
// engine. It consists of a version (suffix) and corresponding value. The range
// key bounds are not included, but are surfaced via EngineRangeBounds().
type EngineRangeKeyValue struct {
Version []byte
Value []byte
}
// Verify ensures the checksum of the current batch entry matches the data.
// Returns an error on checksum mismatch.
func (key *EngineKey) Verify(value []byte) error {
if key.IsMVCCKey() {
mvccKey, err := key.ToMVCCKey()
if err != nil {
return err
}
if mvccKey.IsValue() {
return decodeMVCCValueAndVerify(mvccKey.Key, value)
} else {
return decodeMVCCMetaAndVerify(mvccKey.Key, value)
}
} else if key.IsLockTableKey() {
lockTableKey, err := key.ToLockTableKey()
if err != nil {
return err
}
return decodeMVCCMetaAndVerify(lockTableKey.Key, value)
}
return decodeMVCCMetaAndVerify(key.Key, value)
}
// decodeMVCCValueAndVerify will try to decode the value as
// MVCCValue and then verify the checksum.
func decodeMVCCValueAndVerify(key roachpb.Key, value []byte) error {
mvccValue, err := decodeMVCCValueIgnoringHeader(value)
if err != nil {
return err
}
return mvccValue.Value.Verify(key)
}
// decodeMVCCMetaAndVerify will try to decode the value as
// enginepb.MVCCMetadata and then decode its RawBytes as an
// MVCCValue in order to verify the checksum.
func decodeMVCCMetaAndVerify(key roachpb.Key, value []byte) error {
// TODO(lyang24): refactor to avoid allocation for MVCCMetadata
// per each call.
var meta enginepb.MVCCMetadata
// Time series data might fail the decoding, e.g.
// key 61
// value 0262000917bba16e0aea5ca80900
// N.B. we skip checksum checking in this case.
// nolint:returnerrcheck
if err := protoutil.Unmarshal(value, &meta); err != nil {
return nil
}
return decodeMVCCValueAndVerify(key, meta.RawBytes)
}
// EngineKeyRange is a key range composed of EngineKeys.
type EngineKeyRange struct {
Start, End EngineKey
}
// Copyright 2024 The Cockroach Authors.
//
// Use of this software is governed by the CockroachDB Software License
// included in the /LICENSE file.
package storage
import (
"bytes"
"math"
"testing"
"github.com/cockroachdb/cockroach/pkg/keys"
"github.com/cockroachdb/cockroach/pkg/kv/kvserver/concurrency/lock"
"github.com/cockroachdb/cockroach/pkg/roachpb"
"github.com/cockroachdb/cockroach/pkg/util/hlc"
"github.com/cockroachdb/cockroach/pkg/util/uuid"
)
func encodeLockTableKey(ltk LockTableKey) []byte {
ek, _ := ltk.ToEngineKey(nil)
return ek.Encode()
}
// interestingEngineKeys is a slice of byte slices that may be used in tests as
// engine keys. Not all the keys are valid keys.
var interestingEngineKeys = [][]byte{
{0x00, 0x00},
{0x01, 0x11, 0x01},
EncodeMVCCKey(MVCCKey{Key: roachpb.Key("a"), Timestamp: hlc.Timestamp{WallTime: math.MaxInt64}}),
EncodeMVCCKey(MVCCKey{Key: roachpb.Key("foo"), Timestamp: hlc.Timestamp{WallTime: 1691183078362053000}}),
EncodeMVCCKey(MVCCKey{Key: roachpb.Key("bar")}),
EncodeMVCCKey(MVCCKey{Key: roachpb.Key("bar"), Timestamp: hlc.Timestamp{WallTime: 1643550788737652545}}),
EncodeMVCCKey(MVCCKey{Key: roachpb.Key("bar"), Timestamp: hlc.Timestamp{WallTime: 1643550788737652545, Logical: 1}}),
encodeLockTableKey(LockTableKey{
Key: roachpb.Key("foo"),
Strength: lock.Exclusive,
TxnUUID: uuid.Must(uuid.FromString("6ba7b810-9dad-11d1-80b4-00c04fd430c8")),
}),
encodeLockTableKey(LockTableKey{
Key: keys.RangeDescriptorKey(roachpb.RKey("baz")),
Strength: lock.Exclusive,
TxnUUID: uuid.Must(uuid.FromString("6ba7b810-9dad-11d1-80b4-00c04fd430c8")),
}),
}
// FuzzEngineKeysInvariants fuzz tests various functions over engine keys,
// ensuring that invariants over engine keys hold.
func FuzzEngineKeysInvariants(f *testing.F) {
for i := 0; i < len(interestingEngineKeys); i++ {
for j := 0; j < len(interestingEngineKeys); j++ {
f.Add(interestingEngineKeys[i], interestingEngineKeys[j])
}
}
compareEngineKeys := func(t *testing.T, a, b []byte) int {
cmp := EngineComparer.Compare(a, b)
eq := EngineComparer.Equal(a, b)
// Invariant: Iff EngineKeyCompare(a, b) == 0, EngineKeyEqual(a, b)
if eq != (cmp == 0) {
t.Errorf("EngineKeyEqual(0x%x, 0x%x) = %t; EngineKeyCompare(0x%x, 0x%x) = %d",
a, b, eq, a, b, cmp)
}
return cmp
}
computeImmediateSuccessor := func(t *testing.T, a []byte) []byte {
succ := EngineComparer.ImmediateSuccessor(nil, a)
// Invariant: ImmediateSuccessor(a) > a
if cmp := compareEngineKeys(t, a, succ); cmp >= 0 {
t.Errorf("ImmediateSuccessor(0x%x) = 0x%x, but EngineKeyCompare(0x%x, 0x%x) = %d",
a, succ, a, succ, cmp)
}
return succ
}
decodeEngineKey := func(t *testing.T, a []byte) (EngineKey, bool) {
// Invariant: DecodeEngineKey(a) ok iff GetKeyPartFromEngineKey(a) ok
// Invariant: DecodeEngineKey(a).Key == GetKeyPartFromEngineKey(a)
ek, ok1 := DecodeEngineKey(a)
kp, ok2 := GetKeyPartFromEngineKey(a)
if ok1 != ok2 || ok1 && !bytes.Equal(ek.Key, kp) {
t.Errorf("DecodeEngineKey(0x%x) = (%s, %t); but GetKeyPartFromEngineKey(0x%x) = (0x%x, %t)",
a, ek, ok1, a, kp, ok2)
}
return ek, ok1
}
f.Fuzz(func(t *testing.T, a []byte, b []byte) {
t.Logf("a = 0x%x; b = 0x%x", a, b)
// We can only pass valid keys to the comparer.
ekA, okA := decodeEngineKey(t, a)
ekB, okB := decodeEngineKey(t, b)
if !okA || !okB {
return
}
errA := ekA.Validate()
errB := ekB.Validate()
if errA != nil || errB != nil {
return
}
cmp := compareEngineKeys(t, a, b)
if cmp == 0 {
return
}
if len(a) == 0 || len(b) == 0 {
return
}
// Make a < b.
if cmp > 0 {
a, b = b, a
t.Logf("Swapped: a = 0x%x; b = 0x%x", a, b)
}
// Invariant: Separator(a, b) >= a
// Invariant: Separator(a, b) < b
sep := EngineComparer.Separator(nil, a, b)
if cmp = compareEngineKeys(t, a, sep); cmp > 0 {
t.Errorf("Separator(0x%x, 0x%x) = 0x%x; but EngineKeyCompare(0x%x, 0x%x) = %d",
a, b, sep, a, sep, cmp)
}
if cmp = compareEngineKeys(t, sep, b); cmp >= 0 {
t.Errorf("Separator(0x%x, 0x%x) = 0x%x; but EngineKeyCompare(0x%x, 0x%x) = %d",
a, b, sep, sep, b, cmp)
}
t.Logf("ekA = %s (Key: 0x%x, Version: 0x%x); ekB = %s (Key: 0x%x, Version: 0x%x)",
ekA, ekA.Key, ekA.Version, ekB, ekB.Key, ekB.Version)
splitA := EngineComparer.Split(a)
splitB := EngineComparer.Split(b)
aIsSuffixless := splitA == len(a)
bIsSuffixless := splitB == len(b)
// ImmediateSuccessor is only defined on prefix keys.
var immediateSuccessorA, immediateSuccessorB []byte
if aIsSuffixless {
immediateSuccessorA = computeImmediateSuccessor(t, a)
}
if bIsSuffixless {
immediateSuccessorB = computeImmediateSuccessor(t, b)
}
if aIsSuffixless && bIsSuffixless {
// Invariant: ImmediateSuccessor(a) < ImmediateSuccessor(b)
if cmp = compareEngineKeys(t, immediateSuccessorA, immediateSuccessorB); cmp >= 0 {
t.Errorf("ImmediateSuccessor(0x%x) = 0x%x, ImmediateSuccessor(0x%x) = 0x%x; but EngineKeyCompare(0x%x, 0x%x) = %d",
a, immediateSuccessorA, b, immediateSuccessorB, immediateSuccessorA, immediateSuccessorB, cmp)
}
}
})
}
// Copyright 2022 The Cockroach Authors.
//
// Use of this software is governed by the CockroachDB Software License
// included in the /LICENSE file.
package storage
import (
"bytes"
"context"
"hash"
"hash/fnv"
"io"
"github.com/cockroachdb/cockroach/pkg/keys"
"github.com/cockroachdb/cockroach/pkg/roachpb"
"github.com/cockroachdb/cockroach/pkg/settings/cluster"
"github.com/cockroachdb/cockroach/pkg/util/hlc"
"github.com/cockroachdb/cockroach/pkg/util/tracing"
"github.com/cockroachdb/errors"
)
// fingerprintWriter hashes every key/timestamp and value for point keys, and
// combines their hashes via XOR into a running aggregate.
//
// Range keys are not fingerprinted but instead written to a pebble SST that is
// returned to the caller. This is because range keys do not have a stable,
// discrete identity and so it is up to the caller to define a deterministic
// fingerprinting scheme across all returned range keys.
//
// The caller must Finish() and Close() the fingerprintWriter to finalize the
// writes to the underlying pebble SST.
type fingerprintWriter struct {
hasher hash.Hash64
timestampBuf []byte
options MVCCExportFingerprintOptions
sstWriter *SSTWriter
xorAgg *uintXorAggregate
}
// makeFingerprintWriter creates a new fingerprintWriter.
func makeFingerprintWriter(
ctx context.Context,
hasher hash.Hash64,
cs *cluster.Settings,
f io.Writer,
opts MVCCExportFingerprintOptions,
) fingerprintWriter {
// TODO(adityamaru,dt): Once
// https://github.com/cockroachdb/cockroach/issues/90450 has been addressed we
// should write to a kvBuf instead of a Backup SST writer.
sstWriter := MakeTransportSSTWriter(ctx, cs, f)
return fingerprintWriter{
sstWriter: &sstWriter,
hasher: hasher,
xorAgg: &uintXorAggregate{},
options: opts,
}
}
type uintXorAggregate struct {
sum uint64
}
// add inserts one value into the running xor.
func (a *uintXorAggregate) add(x uint64) {
a.sum = a.sum ^ x
}
// result returns the xor.
func (a *uintXorAggregate) result() uint64 {
return a.sum
}
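// xorAggregateOrderExample is a hedged illustration (with made-up hash
// values) of why fingerprinting uses a XOR aggregate: the result does not
// depend on the order in which per-KV hashes are added.
func xorAggregateOrderExample() bool {
	a, b, c := uint64(0x1111), uint64(0x2222), uint64(0x3333)
	var x, y uintXorAggregate
	x.add(a)
	x.add(b)
	x.add(c)
	y.add(c)
	y.add(a)
	y.add(b)
	return x.result() == y.result() // always true: XOR is commutative
}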
// Finish finalizes the underlying SSTWriter, and returns the aggregated
// fingerprint for point keys.
func (f *fingerprintWriter) Finish() (uint64, error) {
// If no records were added to the sstable, skip completing it.
if f.sstWriter.DataSize != 0 {
if err := f.sstWriter.Finish(); err != nil {
return 0, err
}
}
return f.xorAgg.result(), nil
}
// Close finishes and frees memory and other resources. Close is idempotent.
func (f *fingerprintWriter) Close() {
if f.sstWriter == nil {
return
}
f.sstWriter.Close()
f.hasher.Reset()
f.xorAgg = nil
f.sstWriter = nil
}
var _ ExportWriter = &fingerprintWriter{}
// PutRawMVCCRangeKey implements the Writer interface.
func (f *fingerprintWriter) PutRawMVCCRangeKey(key MVCCRangeKey, bytes []byte) error {
// We do not fingerprint range keys, instead, we write them to a Pebble SST.
// This is because range keys do not have a stable, discrete identity and so
// it is up to the caller to define a deterministic fingerprinting scheme
// across all returned range keys.
return f.sstWriter.PutRawMVCCRangeKey(key, bytes)
}
// PutRawMVCC implements the Writer interface.
func (f *fingerprintWriter) PutRawMVCC(key MVCCKey, value []byte) error {
defer f.hasher.Reset()
// Hash the key/timestamp and value of the RawMVCC.
err, skip := f.hashKey(key.Key)
if err != nil {
return err
}
if skip {
return nil
}
if err := f.hashTimestamp(key.Timestamp); err != nil {
return err
}
if err := f.hashValue(value); err != nil {
return err
}
f.xorAgg.add(f.hasher.Sum64())
return nil
}
// PutUnversioned implements the Writer interface.
func (f *fingerprintWriter) PutUnversioned(key roachpb.Key, value []byte) error {
defer f.hasher.Reset()
// Hash the key and value in the absence of a timestamp.
err, skip := f.hashKey(key)
if err != nil {
return err
}
if skip {
return nil
}
if err := f.hashValue(value); err != nil {
return err
}
f.xorAgg.add(f.hasher.Sum64())
return nil
}
func (f *fingerprintWriter) hashKey(key []byte) (error, bool) {
noTenantPrefix, err := keys.StripTenantPrefix(key)
if err != nil {
return err, false
}
// Fingerprinting ignores rows from a few special-cased key ranges, namely the
// tables that contain ephemeral cluster-topology/state information, which is
// expected to differ between two clusters that otherwise contain the same
// data.
_, tID, _, _ := keys.DecodeTableIDIndexID(noTenantPrefix)
if tID == keys.SqllivenessID || tID == keys.LeaseTableID || tID == keys.SQLInstancesTableID {
return nil, true
}
if f.options.StripIndexPrefixAndTimestamp {
return f.hash(f.stripIndexPrefix(key)), false
}
if f.options.StripTenantPrefix {
return f.hash(noTenantPrefix), false
}
return f.hash(key), false
}
func (f *fingerprintWriter) hashTimestamp(timestamp hlc.Timestamp) error {
if f.options.StripIndexPrefixAndTimestamp {
return nil
}
f.timestampBuf = EncodeMVCCTimestampToBuf(f.timestampBuf, timestamp)
if err := f.hash(f.timestampBuf); err != nil {
return err
}
return nil
}
func (f *fingerprintWriter) hashValue(value []byte) error {
if f.options.StripValueChecksum {
return f.hash(f.stripValueChecksum(value))
}
return f.hash(value)
}
func (f *fingerprintWriter) hash(data []byte) error {
if _, err := f.hasher.Write(data); err != nil {
return errors.NewAssertionErrorWithWrappedErrf(err,
`"It never returns an error." -- https://golang.org/pkg/hash: %T`, f)
}
return nil
}
func (f *fingerprintWriter) stripValueChecksum(value []byte) []byte {
if len(value) < mvccChecksumSize {
return value
}
return value[mvccChecksumSize:]
}
func (f *fingerprintWriter) stripIndexPrefix(key []byte) []byte {
remainder, err := keys.StripIndexPrefix(key)
if err != nil {
return key
}
return remainder
}
// FingerprintRangekeys iterates over the provided SSTs, which are expected to
// contain only rangekeys, and maintains a XOR aggregate of each rangekey's
// fingerprint.
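//
// A hedged usage sketch (hypothetical caller; ctx, cs, and ssts are assumed to
// be in scope, with ssts holding SSTs that contain only range keys):
//
//	rangeKeyFingerprint, err := FingerprintRangekeys(ctx, cs, MVCCExportFingerprintOptions{}, ssts)
//	if err != nil {
//		return 0, err
//	}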
func FingerprintRangekeys(
ctx context.Context, cs *cluster.Settings, opts MVCCExportFingerprintOptions, ssts [][]byte,
) (uint64, error) {
ctx, sp := tracing.ChildSpan(ctx, "storage.FingerprintRangekeys")
defer sp.Finish()
if len(ssts) == 0 {
return 0, nil
}
// Assert that the SSTs do not contain any point keys.
//
// NB: Combined point/range key iteration is usually a fair bit more expensive
// than iterating over them separately.
pointKeyIterOpts := IterOptions{
KeyTypes: IterKeyTypePointsOnly,
UpperBound: keys.MaxKey,
}
pointKeyIter, err := NewMultiMemSSTIterator(ssts, false /* verify */, pointKeyIterOpts)
if err != nil {
return 0, err
}
defer pointKeyIter.Close()
for pointKeyIter.SeekGE(NilKey); ; pointKeyIter.Next() {
if valid, err := pointKeyIter.Valid(); !valid || err != nil {
if err != nil {
return 0, err
}
break
}
hasPoint, _ := pointKeyIter.HasPointAndRange()
if hasPoint {
return 0, errors.AssertionFailedf("unexpected point key; ssts should only contain range keys")
}
}
rangeKeyIterOpts := IterOptions{
KeyTypes: IterKeyTypeRangesOnly,
LowerBound: keys.MinKey,
UpperBound: keys.MaxKey,
}
var fingerprint uint64
iter, err := NewMultiMemSSTIterator(ssts, true /* verify */, rangeKeyIterOpts)
if err != nil {
return fingerprint, err
}
defer iter.Close()
var destFile bytes.Buffer
fw := makeFingerprintWriter(ctx, fnv.New64(), cs, &destFile, opts)
defer fw.Close()
fingerprintRangeKey := func(stack MVCCRangeKeyStack) (uint64, error) {
defer fw.hasher.Reset()
err, skip := fw.hashKey(stack.Bounds.Key)
if err != nil {
return 0, err
}
if skip {
return 0, nil
}
err, skip = fw.hashKey(stack.Bounds.EndKey)
if err != nil {
return 0, err
}
if skip {
return 0, nil
}
for _, v := range stack.Versions {
if err := fw.hashTimestamp(v.Timestamp); err != nil {
return 0, err
}
mvccValue, err := decodeMVCCValueIgnoringHeader(v.Value)
if err != nil {
return 0, errors.Wrapf(err, "decoding mvcc value %s", v.Value)
}
if err := fw.hashValue(mvccValue.Value.RawBytes); err != nil {
return 0, err
}
}
return fw.hasher.Sum64(), nil
}
for iter.SeekGE(MVCCKey{Key: keys.MinKey}); ; iter.Next() {
if ok, err := iter.Valid(); err != nil {
return fingerprint, err
} else if !ok {
break
}
hasPoint, _ := iter.HasPointAndRange()
if hasPoint {
return fingerprint, errors.AssertionFailedf("unexpected point key; ssts should only contain range keys")
}
rangekeyFingerprint, err := fingerprintRangeKey(iter.RangeKeys())
if err != nil {
return fingerprint, err
}
fw.xorAgg.add(rangekeyFingerprint)
}
if destFile.Len() != 0 {
return 0, errors.AssertionFailedf("unexpected data found in destFile")
}
return fw.Finish()
}
// Copyright 2015 The Cockroach Authors.
//
// Use of this software is governed by the CockroachDB Software License
// included in the /LICENSE file.
package storage
import (
"context"
"github.com/cockroachdb/cockroach/pkg/settings/cluster"
)
// The ForTesting functions randomize the settings for separated intents. This
// is a bit peculiar for tests outside the storage package, since they usually
// have higher level cluster constructs, including creating a cluster.Settings
// as part of the StoreConfig. We are ignoring what may be produced there, and
// injecting a different cluster.Settings here. Plumbing that through for all
// the different higher level testing constructs seems painful, and the only
// places that actively change their behavior for separated intents will use
// the cluster.Settings we inject here, which is used for no purpose other
// than configuring separated intents. So the fact that we have two
// inconsistent cluster.Settings is harmless.
// NewDefaultInMemForTesting allocates and returns a new, opened in-memory
// engine with the default configuration. The caller must call the engine's
// Close method when the engine is no longer needed. This method randomizes
// whether separated intents are written.
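//
// A minimal usage sketch (illustrative only):
//
//	eng := NewDefaultInMemForTesting()
//	defer eng.Close()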
func NewDefaultInMemForTesting(opts ...ConfigOption) Engine {
eng, err := Open(
context.Background(), InMemory(), cluster.MakeTestingClusterSettings(),
ForTesting, MaxSizeBytes(1<<20), CombineOptions(opts...),
)
if err != nil {
panic(err)
}
return eng
}
// Copyright 2020 The Cockroach Authors.
//
// Use of this software is governed by the CockroachDB Software License
// included in the /LICENSE file.
package storage
import (
"bytes"
"context"
"fmt"
"math/rand"
"sync"
"github.com/cockroachdb/cockroach/pkg/keys"
"github.com/cockroachdb/cockroach/pkg/kv/kvserver/concurrency/lock"
"github.com/cockroachdb/cockroach/pkg/roachpb"
"github.com/cockroachdb/cockroach/pkg/util"
"github.com/cockroachdb/cockroach/pkg/util/protoutil"
"github.com/cockroachdb/errors"
"github.com/cockroachdb/pebble"
)
// wrappableReader is used to implement a wrapped Reader. A wrapped Reader
// should be used and immediately discarded. It maintains no state of its own
// between calls.
// Why do we not keep the wrapped reader as a member in the caller? Because
// different methods on Reader can need different wrappings depending on what
// they want to observe.
//
// TODO(sumeer): for allocation optimization we could expose a scratch space
// struct that the caller keeps on behalf of the wrapped reader. But we can only
// do such an optimization when we know that the wrappableReader will be used
// with external synchronization that prevents preallocated buffers from being
// modified concurrently. pebbleBatch.{MVCCGet,MVCCGetProto} have MVCCKey
// serialization allocation optimizations which we can't do below. But those
// are probably not performance sensitive, since the performance sensitive
// code probably uses an MVCCIterator.
type wrappableReader interface {
Reader
}
// wrapReader wraps the provided reader, to return an implementation of MVCCIterator
// that supports MVCCKeyAndIntentsIterKind.
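//
// A hedged sketch of the use-and-discard pattern (hypothetical caller; r, ctx,
// and opts are assumed to be in scope):
//
//	iiReader := wrapReader(r)
//	iter, err := iiReader.NewMVCCIterator(ctx, MVCCKeyAndIntentsIterKind, opts)
//	iiReader.Free() // safe: the returned iterator does not depend on the wrapper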
func wrapReader(r wrappableReader) *intentInterleavingReader {
iiReader := intentInterleavingReaderPool.Get().(*intentInterleavingReader)
*iiReader = intentInterleavingReader{wrappableReader: r}
return iiReader
}
type intentInterleavingReader struct {
wrappableReader
}
var _ Reader = &intentInterleavingReader{}
var intentInterleavingReaderPool = sync.Pool{
New: func() interface{} {
return &intentInterleavingReader{}
},
}
// NewMVCCIterator implements the Reader interface. The
// intentInterleavingReader can be freed once this method returns.
func (imr *intentInterleavingReader) NewMVCCIterator(
ctx context.Context, iterKind MVCCIterKind, opts IterOptions,
) (MVCCIterator, error) {
if (!opts.MinTimestamp.IsEmpty() || !opts.MaxTimestamp.IsEmpty()) &&
iterKind == MVCCKeyAndIntentsIterKind {
panic("cannot ask for interleaved intents when specifying timestamp hints")
}
if iterKind == MVCCKeyIterKind || opts.KeyTypes == IterKeyTypeRangesOnly {
return imr.wrappableReader.NewMVCCIterator(ctx, MVCCKeyIterKind, opts)
}
return newIntentInterleavingIterator(ctx, imr.wrappableReader, opts)
}
func (imr *intentInterleavingReader) Free() {
*imr = intentInterleavingReader{}
intentInterleavingReaderPool.Put(imr)
}
type intentInterleavingIterConstraint int8
const (
notConstrained intentInterleavingIterConstraint = iota
constrainedToLocal
constrainedToGlobal
)
// intentInterleavingIter makes separated intents appear as interleaved. It
// relies on the following assumptions:
// - There can be no physically interleaved intents, i.e., all intents are
// separated (in the lock table keyspace).
// - An intent will have a corresponding provisional value.
// - The only single key locks in the lock table key space are intents.
//
// Semantically, the functionality is equivalent to merging two MVCCIterators:
// - A MVCCIterator on the MVCC key space.
// - A MVCCIterator constructed by wrapping an EngineIterator on the lock table
// key space where the EngineKey is transformed into the corresponding
// intent key and appears as MVCCKey{Key: intentKey}.
//
// The implementation below is specialized to reduce unnecessary comparisons
// and iteration, by utilizing the aforementioned assumptions. The intentIter
// iterates over the lock table key space and iter over the MVCC key space.
// They are kept synchronized in the following way (for forward iteration):
// - At the same MVCCKey.Key: the intentIter is at the intent and iter at the
// provisional value.
// - At different MVCCKey.Keys: the intentIter is ahead of iter, at the first
// key after iter's MVCCKey.Key that has an intent.
//
// Note that in both cases the iterators are apart by the minimal possible
// distance. This minimal distance rule applies for reverse iteration too, and
// can be used to construct similar invariants.
// The one exception to the minimal distance rule is a sub-case of prefix
// iteration, when we know that no separated intents need to be seen, and so
// don't bother positioning intentIter.
//
// The implementation of intentInterleavingIter assumes callers iterating
// forward (reverse) are setting an upper (lower) bound. There is protection
// for misbehavior by the callers that don't set such bounds, by manufacturing
// bounds. These manufactured bounds prevent the lock table iterator from
// leaving the lock table key space. We also need to manufacture bounds for
// the MVCCIterator to prevent it from iterating into the lock table. Note
// that any manufactured bounds for both the lock table iterator and
// MVCCIterator must be consistent since the intentInterleavingIter does not
// like to see a lock with no corresponding provisional value (it will
// consider that an error). Manufacturing of bounds is complicated by the fact
// that the MVCC key space is split into two spans: local keys preceding the
// lock table key space, and global keys. To manufacture a bound, we need to
// know whether the caller plans to iterate over local or global keys. Setting
// aside prefix iteration, which doesn't need any of these manufactured
// bounds, the call to newIntentInterleavingIter must have specified at least
// one of the lower or upper bound. We use that to "constrain" the iterator as
// either a local key iterator or global key iterator and panic if a caller
// violates that in a subsequent SeekGE/SeekLT call.
//
// intentInterleavingIter ignores locks in the lock table keyspace with
// strengths other than lock.Intent (i.e. shared and exclusive locks). Future
// versions of the iterator may expose information to users about whether any
// non-intent locks were observed and, if so, which keys they were found on. For
// now, no such information is exposed.
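//
// Illustrative example of the positioning invariant (a sketch, not from the
// source): suppose the MVCC key space contains a@3 (the provisional value for
// an intent on a) and b@2, and the lock table contains a single intent on a.
// During forward iteration:
//   - While the merged iterator is at a@3, intentIter is at the intent on a
//     and iter is at a@3, so intentCmp == 0.
//   - While the merged iterator is at b@2, intentIter has moved past a and,
//     with no further intents, is exhausted (intentKey == nil), which is
//     treated as being ahead of iter.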
type intentInterleavingIter struct {
prefix bool
constraint intentInterleavingIterConstraint
// iter is for iterating over MVCC keys.
iter *pebbleIterator // MVCCIterator
// The valid value from iter.Valid() after the last positioning call.
iterValid bool
// When iterValid = true, this contains the result of iter.UnsafeKey(). We
// store it here to avoid repeatedly calling UnsafeKey() since it repeats
// key parsing.
iterKey MVCCKey
// intentIter is for iterating over the lock table keyspace and finding
// intents, so that intentInterleavingIter can make them look as if they
// were interleaved.
intentIter *LockTableIterator // EngineIterator
intentIterState pebble.IterValidityState
// The decoded key from the lock table. This is an unsafe key
// in that it is only valid when intentIter has not been
// repositioned. It is nil if the intentIter is considered to be
// exhausted. Note that the intentIter may still be positioned
// at a valid position in the case of prefix iteration, but the
// state of the intentKey overrides that state.
intentKey roachpb.Key
intentKeyAsNoTimestampMVCCKey []byte
intentKeyAsNoTimestampMVCCKeyBacking []byte
// - cmp output of (intentKey, current iter key) when both are valid.
// This does not take timestamps into consideration. So if intentIter
// is at an intent, and iter is at the corresponding provisional value,
// cmp will be 0. See the longer struct-level comment for more on the
// relative positioning of intentIter and iter.
// - intentKey==nil, iterValid==true, cmp=dir
// (i.e., the nil key is akin to infinity in the forward direction
// and -infinity in the reverse direction, since that iterator is
// exhausted).
// - intentKey!=nil, iterValid=false, cmp=-dir.
// - If both are invalid, cmp is undefined and valid=false.
intentCmp int
// When intentCmp == 0, this will be set to indicate whether iter is on an
// unversioned position on a bare range key copositioned with the intent.
// This will never happen in the forward direction due to
// maybeSkipIntentRangeKey(). In the reverse direction, if an intent is
// located on the start key of an overlapping range key, then we cannot step
// iter past the range key to satisfy the usual intentCmp > 0 condition,
// because we need the range keys to be exposed via e.g. RangeKeys(). We
// therefore also have to consider isCurAtIntentIter to be true when iter is
// positioned on a bare unversioned range key colocated with an intent,
// i.e. i.dir < 0 && i.intentCmp == 0 && i.iterBareRangeAtIntent.
//
// NB: This value is not valid for intentCmp != 0.
iterBareRangeAtIntent bool
// rangeKeyChanged keeps track of RangeKeyChanged() for the current
// iterator position. This can't simply call through to the parent
// iterator for two reasons:
//
// - maybeSkipIntentRangeKey() may step the iterator forward from
// a bare range key onto a provisional value, which would cause
// RangeKeyChanged() to return false rather than true.
//
// - reverse iteration may prematurely move onto a range key when
// positioned on an intent not overlapping the range key.
rangeKeyChanged bool
// The current direction. +1 for forward, -1 for reverse.
dir int
valid bool
err error
// Buffers to reuse memory when constructing lock table keys for bounds and
// seeks.
intentKeyBuf []byte
intentLimitKeyBuf []byte
}
var _ MVCCIterator = &intentInterleavingIter{}
var intentInterleavingIterPool = sync.Pool{
New: func() interface{} {
return &intentInterleavingIter{}
},
}
func isLocal(k roachpb.Key) bool {
return k.Compare(keys.LocalMax) < 0
}
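// A hedged construction sketch for newIntentInterleavingIterator (hypothetical
// bounds; non-prefix iteration must supply at least one of LowerBound or
// UpperBound, as enforced below):
//
//	iter, err := newIntentInterleavingIterator(ctx, reader, IterOptions{
//		LowerBound: keys.LocalMax,
//		UpperBound: keys.MaxKey,
//	})
//	if err != nil {
//		return err
//	}
//	defer iter.Close()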
func newIntentInterleavingIterator(
ctx context.Context, reader Reader, opts IterOptions,
) (MVCCIterator, error) {
if !opts.MinTimestamp.IsEmpty() || !opts.MaxTimestamp.IsEmpty() {
panic("intentInterleavingIter must not be used with timestamp hints")
}
var lowerIsLocal, upperIsLocal bool
var constraint intentInterleavingIterConstraint
if opts.LowerBound != nil {
lowerIsLocal = isLocal(opts.LowerBound)
if lowerIsLocal {
constraint = constrainedToLocal
} else {
constraint = constrainedToGlobal
}
}
if opts.UpperBound != nil {
upperIsLocal = isLocal(opts.UpperBound) || bytes.Equal(opts.UpperBound, keys.LocalMax)
if opts.LowerBound != nil && lowerIsLocal != upperIsLocal {
panic(fmt.Sprintf(
"intentInterleavingIter cannot span from lowerIsLocal %t, %s to upperIsLocal %t, %s",
lowerIsLocal, opts.LowerBound.String(), upperIsLocal, opts.UpperBound.String()))
}
if upperIsLocal {
constraint = constrainedToLocal
} else {
constraint = constrainedToGlobal
}
}
if !opts.Prefix {
if opts.LowerBound == nil && opts.UpperBound == nil {
// This is the same requirement as pebbleIterator.
panic("iterator must set prefix or upper bound or lower bound")
}
// At least one bound is specified, so constraint != notConstrained. But we
// may need to manufacture a bound for the currently unbounded side.
if opts.LowerBound == nil && constraint == constrainedToGlobal {
// Iterating over global keys, and need a lower-bound, to prevent the MVCCIterator
// from iterating into the lock table.
opts.LowerBound = keys.LocalMax
}
if opts.UpperBound == nil && constraint == constrainedToLocal {
// Iterating over local keys, and need an upper-bound, to prevent the MVCCIterator
// from iterating into the lock table.
opts.UpperBound = keys.LocalRangeLockTablePrefix
}
}
// Else prefix iteration, so do not need to manufacture bounds for both
// iterators since the pebble.Iterator implementation will hide the keys
// that do not match the prefix. Note that this is not equivalent to
// constraint==notConstrained -- it is acceptable for a caller to specify a
// bound for prefix iteration, though since they don't need to, most callers
// don't.
// There cannot be any range keys across the lock table, so create the intent
// iterator for point keys only, or return a normal MVCC iterator if only
// range keys are requested.
if opts.KeyTypes == IterKeyTypeRangesOnly {
return reader.NewMVCCIterator(ctx, MVCCKeyIterKind, opts)
}
iiIter := intentInterleavingIterPool.Get().(*intentInterleavingIter)
intentKeyBuf := iiIter.intentKeyBuf
intentLimitKeyBuf := iiIter.intentLimitKeyBuf
ltOpts := LockTableIteratorOptions{
Prefix: opts.Prefix, MatchMinStr: lock.Intent, ReadCategory: opts.ReadCategory}
if opts.LowerBound != nil {
ltOpts.LowerBound, intentKeyBuf = keys.LockTableSingleKey(opts.LowerBound, intentKeyBuf)
} else if !opts.Prefix {
// Make sure we don't step outside the lock table key space. Note that
// this is the case where the lower bound was not set and
// constrainedToLocal.
ltOpts.LowerBound = keys.LockTableSingleKeyStart
}
if opts.UpperBound != nil {
ltOpts.UpperBound, intentLimitKeyBuf =
keys.LockTableSingleKey(opts.UpperBound, intentLimitKeyBuf)
} else if !opts.Prefix {
// Make sure we don't step outside the lock table key space. Note that
// this is the case where the upper bound was not set and
// constrainedToGlobal.
ltOpts.UpperBound = keys.LockTableSingleKeyEnd
}
// Note that we can reuse intentKeyBuf, intentLimitKeyBuf after
// NewLockTableIter returns.
intentIter, err := NewLockTableIterator(ctx, reader, ltOpts)
if err != nil {
return nil, err
}
// The creation of these iterators can race with concurrent mutations, which
// may make them inconsistent with each other. So we clone here, to ensure
// consistency (certain Reader implementations already ensure consistency,
// and we use that when possible to save allocations).
var iter *pebbleIterator
if reader.ConsistentIterators() {
mvccIter, err := reader.NewMVCCIterator(ctx, MVCCKeyIterKind, opts)
if err != nil {
return nil, err
}
iter = maybeUnwrapUnsafeIter(mvccIter).(*pebbleIterator)
} else {
iter = newPebbleIteratorByCloning(ctx, intentIter.CloneContext(), opts, StandardDurability)
}
*iiIter = intentInterleavingIter{
prefix: opts.Prefix,
constraint: constraint,
iter: iter,
intentIter: intentIter,
intentKeyAsNoTimestampMVCCKeyBacking: iiIter.intentKeyAsNoTimestampMVCCKeyBacking,
intentKeyBuf: intentKeyBuf,
intentLimitKeyBuf: intentLimitKeyBuf,
}
return iiIter, nil
}
// TODO(sumeer): the limits generated below are tight for the current value of
// i.iterKey.Key. And the semantics of the underlying *WithLimit methods in
// pebble.Iterator are best-effort, but the implementation is not. Consider
// strengthening the semantics and using the tightness of these limits to
// avoid comparisons between iterKey and intentKey.
// makeUpperLimitKey uses the current value of i.iterKey.Key (and assumes
// i.iterValid=true), to construct an exclusive upper limit roachpb.Key that
// will include the intent for i.iterKey.Key.
func (i *intentInterleavingIter) makeUpperLimitKey() roachpb.Key {
key := i.iterKey.Key
// The +2 is to account for the call to BytesNext and the need to append a
// '\x00' in the implementation of the *WithLimit function. The rest is the
// same as in the implementation of LockTableSingleKey. The BytesNext is to
// construct the exclusive roachpb.Key as mentioned earlier. The
// implementation of *WithLimit (in pebbleIterator), has to additionally
// append '\x00' (the sentinel byte) to construct an encoded EngineKey with
// an empty version.
keyLen :=
len(keys.LocalRangeLockTablePrefix) + len(keys.LockTableSingleKeyInfix) + len(key) + 3 + 2
if cap(i.intentLimitKeyBuf) < keyLen {
i.intentLimitKeyBuf = make([]byte, 0, keyLen)
}
_, i.intentLimitKeyBuf = keys.LockTableSingleKey(key, i.intentLimitKeyBuf)
// To construct the exclusive limitKey, roachpb.BytesNext gives us a
// tight limit. Since it appends \x00, this is not decodable, except at
// the Pebble level, which is all we need here. We don't actually use
// BytesNext since it tries not to overwrite the slice.
i.intentLimitKeyBuf = append(i.intentLimitKeyBuf, '\x00')
return i.intentLimitKeyBuf
}
// makeLowerLimitKey uses the current value of i.iterKey.Key (and assumes
// i.iterValid=true), to construct an inclusive lower limit roachpb.Key that
// will include the intent for i.iterKey.Key.
func (i *intentInterleavingIter) makeLowerLimitKey() roachpb.Key {
key := i.iterKey.Key
// The +1 is to account for the need to append a '\x00' in the
// implementation of the *WithLimit function. The rest is the same as in the
// implementation of LockTableSingleKey. The implementation of *WithLimit
// (in pebbleIterator), has to additionally append '\x00' (the sentinel
// byte) to construct an encoded EngineKey with an empty version.
keyLen :=
len(keys.LocalRangeLockTablePrefix) + len(keys.LockTableSingleKeyInfix) + len(key) + 3 + 1
if cap(i.intentLimitKeyBuf) < keyLen {
i.intentLimitKeyBuf = make([]byte, 0, keyLen)
}
_, i.intentLimitKeyBuf = keys.LockTableSingleKey(key, i.intentLimitKeyBuf)
return i.intentLimitKeyBuf
}
// maybeSkipIntentRangeKey will step iter once forwards if iter is positioned on
// a bare range key with the same key position (either start key or seek key) as
// the current intentIter intent.
//
// This is necessary when intentIter lands on a new intent, to ensure iter is
// positioned on the provisional value instead of the bare range key. This must
// be done after positioning both iterators.
//
// NB: This is called before computePos(), and can't rely on intentCmp.
//
// REQUIRES: i.dir > 0
//
// gcassert:inline
func (i *intentInterleavingIter) maybeSkipIntentRangeKey() error {
if util.RaceEnabled && i.dir < 0 {
i.err = errors.AssertionFailedf("maybeSkipIntentRangeKey called in reverse")
i.valid = false
return i.err
}
if i.iterValid && i.intentKey != nil {
return i.doMaybeSkipIntentRangeKey()
}
return nil
}
// doMaybeSkipIntentRangeKey is a helper for maybeSkipIntentRangeKey(), which
// allows mid-stack inlining of the former.
func (i *intentInterleavingIter) doMaybeSkipIntentRangeKey() error {
if hasPoint, hasRange := i.iter.HasPointAndRange(); hasRange && !hasPoint {
// iter may be on a bare range key that will cover the provisional value,
// in which case we can step onto it. We guard against emitting the wrong
// range key for the intent if the provisional value turns out to be
// missing by:
//
// 1. Before we step, make sure iter isn't ahead of intentIter. We have
// to do a key comparison anyway in case intentIter is ahead of iter.
// 2. After we step, make sure we're on a point key covered by a range key.
// We don't need a key comparison (but do so under race), because if
// the provisional value is missing then we'll either land on a
// different point key below the range key (which will emit the
// correct range key), or we'll land on a different bare range key.
//
// TODO(erikgrinaker): in cases where we don't step iter, we can save
// the result of the comparison in i.intentCmp to avoid another one.
if intentCmp := i.intentKey.Compare(i.iterKey.Key); intentCmp < 0 {
i.err = errors.Errorf("iter ahead of provisional value for intent %s (at %s)",
i.intentKey, i.iterKey)
i.valid = false
return i.err
} else if intentCmp == 0 {
i.iter.Next()
if err := i.tryDecodeKey(); err != nil {
return err
}
hasPoint, hasRange = i.iter.HasPointAndRange()
if !hasPoint || !hasRange {
i.err = errors.Errorf("iter not on provisional value for intent %s", i.intentKey)
i.valid = false
return i.err
}
}
}
return nil
}
// maybeSuppressRangeKeyChanged will suppress i.rangeKeyChanged in the reverse
// direction if the underlying iterator has moved past an intent onto a
// different range key that should not be surfaced yet. Must be called after
// computePos().
//
// gcassert:inline
func (i *intentInterleavingIter) maybeSuppressRangeKeyChanged() {
if util.RaceEnabled && i.dir > 0 {
panic(errors.AssertionFailedf("maybeSuppressRangeKeyChanged called in forward direction"))
}
// NB: i.intentCmp > 0 implies isCurAtIntentIterReverse(), but is cheaper.
if i.rangeKeyChanged && i.intentCmp > 0 {
i.doMaybeSuppressRangeKeyChanged()
}
}
// doMaybeSuppressRangeKeyChanged is a helper for maybeSuppressRangeKeyChanged
// which allows mid-stack inlining of the former.
func (i *intentInterleavingIter) doMaybeSuppressRangeKeyChanged() {
i.rangeKeyChanged = i.iter.RangeBounds().EndKey.Compare(i.intentKey) > 0
}
// shouldAdjustSeekRangeKeyChanged returns true if a seek (any kind) needs to
// adjust the RangeKeyChanged signal from the underlying iter. This is necessary
// when intentInterleavingIter was previously positioned on an intent in the
// reverse direction, with iter positioned on a previous range key that did
// not overlap the point key (suppressed via maybeSuppressRangeKeyChanged).
//
// In this case, a seek may incorrectly emit or omit a RangeKeyChanged signal
// when iter is seeked, since it's relative to iter's former position rather
// than the intent's (and thus intentInterleavingIter's) position.
//
// This situation is only possible when the intent does not overlap any range
// keys (otherwise, iter would either have stopped at a point key which overlaps
// the same range key, or at the range key's start bound). Thus, when this
// situation occurs, RangeKeyChanged must be set equal to iter's hasRange value
// after the seek: we know we were not previously positioned on a range key, so
// if hasRange is true then RangeKeyChanged must be true (we seeked onto a range
// key), and if hasRange is false then RangeKeyChanged must be false (we did not
// seek onto a range key).
//
// gcassert:inline
func (i *intentInterleavingIter) shouldAdjustSeekRangeKeyChanged() bool {
if i.dir == -1 && i.intentCmp > 0 && i.valid && i.iterValid {
return i.doShouldAdjustSeekRangeKeyChanged()
}
return false
}
// doShouldAdjustSeekRangeKeyChanged is a shouldAdjustSeekRangeKeyChanged
// helper, which allows mid-stack inlining of the former.
func (i *intentInterleavingIter) doShouldAdjustSeekRangeKeyChanged() bool {
if _, iterHasRange := i.iter.HasPointAndRange(); iterHasRange {
if _, hasRange := i.HasPointAndRange(); !hasRange {
return true
}
}
return false
}
// adjustSeekRangeKeyChanged adjusts i.rangeKeyChanged as described in
// shouldAdjustSeekRangeKeyChanged.
func (i *intentInterleavingIter) adjustSeekRangeKeyChanged() {
if i.iterValid {
_, hasRange := i.iter.HasPointAndRange()
i.rangeKeyChanged = hasRange
} else {
i.rangeKeyChanged = false
}
}
func (i *intentInterleavingIter) SeekGE(key MVCCKey) {
adjustRangeKeyChanged := i.shouldAdjustSeekRangeKeyChanged()
i.dir = +1
i.valid = true
i.err = nil
if i.constraint != notConstrained {
i.checkConstraint(key.Key, false)
}
i.iter.SeekGE(key)
if err := i.tryDecodeKey(); err != nil {
return
}
i.rangeKeyChanged = i.iter.RangeKeyChanged()
if adjustRangeKeyChanged {
i.adjustSeekRangeKeyChanged()
}
var intentSeekKey roachpb.Key
if key.Timestamp.IsEmpty() {
// Common case.
intentSeekKey, i.intentKeyBuf = keys.LockTableSingleKey(key.Key, i.intentKeyBuf)
} else if !i.prefix {
// Seeking to a specific version, so go past the intent.
intentSeekKey, i.intentKeyBuf = keys.LockTableSingleNextKey(key.Key, i.intentKeyBuf)
} else {
// Else seeking to a particular version and using prefix iteration,
// so don't expect to ever see the intent. NB: intentSeekKey is nil.
i.intentKey = nil
}
if !i.iterValid && i.prefix {
// The prefix seek below will also certainly fail, as we didn't find an
// MVCC value here.
intentSeekKey = nil
i.intentKey = nil
}
if intentSeekKey != nil {
var limitKey roachpb.Key
if i.iterValid && !i.prefix {
limitKey = i.makeUpperLimitKey()
}
iterState, err := i.intentIter.SeekEngineKeyGEWithLimit(EngineKey{Key: intentSeekKey}, limitKey)
if err = i.tryDecodeLockKey(iterState, err); err != nil {
return
}
if err := i.maybeSkipIntentRangeKey(); err != nil {
return
}
}
i.computePos()
}
func (i *intentInterleavingIter) checkConstraint(k roachpb.Key, isExclusiveUpper bool) {
kConstraint := constrainedToGlobal
if isLocal(k) {
if bytes.Compare(k, keys.LocalRangeLockTablePrefix) > 0 {
panic(fmt.Sprintf("intentInterleavingIter cannot be used with invalid local keys %s",
k.String()))
}
kConstraint = constrainedToLocal
} else if isExclusiveUpper && bytes.Equal(k, keys.LocalMax) {
kConstraint = constrainedToLocal
}
if kConstraint != i.constraint {
panic(fmt.Sprintf(
"iterator with constraint=%d is being used with key %s that has constraint=%d",
i.constraint, k.String(), kConstraint))
}
}
func (i *intentInterleavingIter) tryDecodeKey() error {
i.iterValid, i.err = i.iter.Valid()
if i.iterValid {
i.iterKey = i.iter.UnsafeKey()
}
if i.err != nil {
i.valid = false
}
return i.err
}
// Assumes that i.err == nil, and that i.iterValid and i.iterKey are up to date.
func (i *intentInterleavingIter) computePos() {
if !i.iterValid && i.intentKey == nil {
i.valid = false
return
}
// INVARIANT: i.iterValid || i.intentKey != nil
if !i.iterValid {
i.intentCmp = -i.dir
return
}
if i.intentKey == nil {
i.intentCmp = i.dir
} else {
i.intentCmp = i.intentKey.Compare(i.iterKey.Key)
if i.intentCmp == 0 {
// We have to handle the case where intentIter is on an intent and iter is
// on a bare range key at the same key position.
//
// In the forward direction, this should never happen: the caller should
// have called maybeSkipIntentRangeKey() to step onto the provisional
// value (or a later key, if the provisional value is absent, which we
// will check later). The provisional value will be covered by the same
// range keys as the intent.
//
// In the reverse direction, there are two cases:
//
// In the typical case, iter will be on the range key's unversioned start
// key. We cannot move past this to satisfy intentCmp < 0 (the usual
// condition for isCurAtIntentIter), because we need to expose those range
// keys via e.g. RangeKeys().
//
// However, there is also the case where we're on a versioned key position
// following a versioned SeekGE call, i.e. we're in the middle of
// switching directions during a Prev() call. For example, we're on
// position b@3 of [a-c)@3 with an intent at b. In this case, we should
// not be considered located on the intent yet -- we'll land on it after a
// subsequent Prev() call.
//
// We track this as iterBareRangeAtIntent, assuming intentCmp == 0:
//
// hasRange && !hasPoint && Timestamp.IsEmpty()
if i.dir > 0 {
i.iterBareRangeAtIntent = false
} else {
hasPoint, hasRange := i.iter.HasPointAndRange()
i.iterBareRangeAtIntent = !hasPoint && hasRange && i.iterKey.Timestamp.IsEmpty()
}
}
}
}
func (i *intentInterleavingIter) tryDecodeLockKey(
iterState pebble.IterValidityState, err error,
) error {
if err != nil {
i.err = err
i.valid = false
return err
}
i.intentIterState = iterState
if iterState != pebble.IterValid {
// NB: this does not set i.valid = false, since this method does not care
// about the state of i.iter, which may be valid. It is the caller's
// responsibility to additionally use the state of i.iter to appropriately
// set i.valid.
i.intentKey = nil
return nil
}
engineKey, err := i.intentIter.UnsafeEngineKey()
if err != nil {
i.err = err
i.valid = false
return err
}
if i.intentKey, err = keys.DecodeLockTableSingleKey(engineKey.Key); err != nil {
i.err = err
i.valid = false
return err
}
// If we were to encode MVCCKey{Key: i.intentKey}, i.e., encode it as an
// MVCCKey with no timestamp, the encoded bytes would be intentKey + \x00.
// Such an encoding is needed by callers of UnsafeRawMVCCKey. We would like
// to avoid copying the bytes in intentKey, if possible, for this encoding.
// Fortunately, the common case in the above call of
// DecodeLockTableSingleKey, that decodes intentKey from engineKey.Key, is
// for intentKey to not need un-escaping, so it will point to the slice that
// was backing engineKey.Key. engineKey.Key uses an encoding that terminates
// the intent key using \x00\x01. So the \x00 we need is conveniently there.
// This optimization also usually works when there is un-escaping, since the
// slice growth algorithm usually ends up with a cap greater than len. Since
// these extra bytes in the cap are 0-initialized, the first byte following
// intentKey is \x00.
//
// If this optimization is not possible, we leave
// intentKeyAsNoTimestampMVCCKey as nil, and lazily initialize it, if
// needed.
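//
// Illustrative example of the layout this relies on (a sketch, assuming the
// usual encoding where an MVCCKey with an empty timestamp is the key followed
// by a single 0x00 sentinel byte): for an intent on "a", the desired encoding
// is "a\x00", so if the byte in the backing slice immediately after intentKey
// is already 0x00, we can take intentKey[:len+1] and avoid the copy.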
i.intentKeyAsNoTimestampMVCCKey = nil
if cap(i.intentKey) > len(i.intentKey) {
prospectiveKey := i.intentKey[:len(i.intentKey)+1]
if prospectiveKey[len(i.intentKey)] == 0 {
i.intentKeyAsNoTimestampMVCCKey = prospectiveKey
}
}
return nil
}
func (i *intentInterleavingIter) Valid() (bool, error) {
if util.RaceEnabled && i.valid {
if err := i.assertInvariants(); err != nil {
return false, err
}
}
return i.valid, i.err
}
func (i *intentInterleavingIter) Next() {
if i.err != nil {
return
}
if i.dir < 0 {
// Switching from reverse to forward iteration.
if util.RaceEnabled && i.prefix {
panic(errors.AssertionFailedf("dir < 0 with prefix iteration"))
}
isCurAtIntent := i.isCurAtIntentIterReverse()
i.dir = +1
if !i.valid {
// Both iterators are exhausted. We know that this is non-prefix
// iteration, as reverse iteration is not supported with prefix
// iteration. Since intentKey is synchronized with intentIter for
// non-prefix iteration, step both forward.
i.valid = true
i.iter.Next()
if err := i.tryDecodeKey(); err != nil {
return
}
i.rangeKeyChanged = i.iter.RangeKeyChanged()
var limitKey roachpb.Key
if i.iterValid {
limitKey = i.makeUpperLimitKey()
}
iterState, err := i.intentIter.NextEngineKeyWithLimit(limitKey)
if err = i.tryDecodeLockKey(iterState, err); err != nil {
return
}
if err := i.maybeSkipIntentRangeKey(); err != nil {
return
}
i.computePos()
return
}
// At least one of the iterators is not exhausted.
if isCurAtIntent {
// Reverse iteration was positioned at the intent, so either (a) iter
// precedes the intentIter, so must be at the lowest version of the
// preceding key or exhausted, or (b) iter is at a bare range key whose
// start key is colocated with the intent.
// Step iter forward. It will now point to
// a key that is the same as the intent key since an intent always has a
// corresponding provisional value, and provisional values must have a
// higher timestamp than any committed value on a key. Note that the
// code below does not specifically care if a bug (external to this
// code) violates the invariant that the iter is pointing to the
// provisional value, but it does care that iter is pointing to some
// version of that key.
i.iter.Next()
if err := i.tryDecodeKey(); err != nil {
return
}
i.rangeKeyChanged = i.iter.RangeKeyChanged()
i.intentCmp = 0
if !i.iterValid {
i.err = errors.Errorf("intent has no provisional value")
i.valid = false
return
}
if hasPoint, hasRange := i.iter.HasPointAndRange(); hasRange && !hasPoint {
// If there was a bare range key before the provisional value, iter
// would have been positioned there prior to the i.iter.Next() call,
// so it must now be at the provisional value, but it is not.
i.err = errors.Errorf("intent has no provisional value")
i.valid = false
return
}
} else {
// The intentIter precedes the iter. It could be for the same key, iff
// this key has an intent, or an earlier key. Either way, stepping
// forward will take it to an intent for a later key.
limitKey := i.makeUpperLimitKey()
iterState, err := i.intentIter.NextEngineKeyWithLimit(limitKey)
if err = i.tryDecodeLockKey(iterState, err); err != nil {
return
}
// NB: doesn't need maybeSkipIntentRangeKey() as intentCmp > 0.
i.intentCmp = +1
}
// INVARIANT: i.valid
}
if !i.valid {
return
}
if i.isCurAtIntentIterForward() {
// The iterator is positioned at an intent in intentIter. iter must be
// positioned at the provisional value. Note that the code below does not
// specifically care if a bug (external to this code) violates the
// invariant that the iter is pointing to the provisional value, but it
// does care that iter is pointing to some version of that key.
if i.intentCmp != 0 {
i.err = errors.Errorf("intentIter at intent, but iter not at provisional value")
i.valid = false
return
}
if !i.iterValid {
i.err = errors.Errorf("iter expected to be at provisional value, but is exhausted")
i.valid = false
return
}
var limitKey roachpb.Key
if !i.prefix {
limitKey = i.makeUpperLimitKey()
}
iterState, err := i.intentIter.NextEngineKeyWithLimit(limitKey)
if err = i.tryDecodeLockKey(iterState, err); err != nil {
return
}
i.rangeKeyChanged = false // already surfaced at the intent
// NB: doesn't need maybeSkipIntentRangeKey() as intentCmp > 0.
i.intentCmp = +1
} else {
// Common case:
// The iterator is positioned at iter, at an MVCC value.
i.iter.Next()
if err := i.tryDecodeKey(); err != nil {
return
}
i.rangeKeyChanged = i.iter.RangeKeyChanged()
if i.intentIterState == pebble.IterAtLimit && i.iterValid && !i.prefix {
// TODO(sumeer): could avoid doing this if i.iter has stepped to
// different version of same key.
limitKey := i.makeUpperLimitKey()
iterState, err := i.intentIter.NextEngineKeyWithLimit(limitKey)
if err = i.tryDecodeLockKey(iterState, err); err != nil {
return
}
}
// Whether we stepped the intentIter or not, we have stepped iter, and
// iter could now be at a bare range key that is equal to the intentIter
// key.
if err := i.maybeSkipIntentRangeKey(); err != nil {
return
}
i.computePos()
}
}
func (i *intentInterleavingIter) NextKey() {
// NextKey is not called to switch directions, i.e., we must already
// be in the forward direction.
if i.dir < 0 {
i.err = errors.Errorf("NextKey cannot be used to switch iteration direction")
i.valid = false
return
}
if !i.valid {
return
}
if i.isCurAtIntentIterForward() {
// The iterator is positioned at an intent in intentIter. iter must be
// positioned at the provisional value.
if i.intentCmp != 0 {
i.err = errors.Errorf("intentIter at intent, but iter not at provisional value")
i.valid = false
return
}
// Step the iter to NextKey(), i.e., past all the versions of this key.
// Note that iter may already be exhausted, in which case calling NextKey
// is a no-op.
i.iter.NextKey()
if err := i.tryDecodeKey(); err != nil {
return
}
i.rangeKeyChanged = i.iter.RangeKeyChanged()
var limitKey roachpb.Key
if i.iterValid && !i.prefix {
limitKey = i.makeUpperLimitKey()
}
iterState, err := i.intentIter.NextEngineKeyWithLimit(limitKey)
if err := i.tryDecodeLockKey(iterState, err); err != nil {
return
}
if err := i.maybeSkipIntentRangeKey(); err != nil {
return
}
i.computePos()
return
}
// Common case:
// The iterator is positioned at iter, i.e., at a MVCC value.
// Step the iter to NextKey(), i.e., past all the versions of this key.
i.iter.NextKey()
if err := i.tryDecodeKey(); err != nil {
return
}
i.rangeKeyChanged = i.iter.RangeKeyChanged()
if i.intentIterState == pebble.IterAtLimit && i.iterValid && !i.prefix {
limitKey := i.makeUpperLimitKey()
iterState, err := i.intentIter.NextEngineKeyWithLimit(limitKey)
if err = i.tryDecodeLockKey(iterState, err); err != nil {
return
}
}
if err := i.maybeSkipIntentRangeKey(); err != nil {
return
}
i.computePos()
}
// TODO(erikgrinaker): Consider computing this once and storing it as a struct
// field when repositioning the iterator, instead of repeatedly calling it. The
// forward/reverse methods are called at least once per step, with two more
// calls for UnsafeKey() and UnsafeValue(), and this has a measurable cost
// (especially in the reverse direction).
//
// gcassert:inline
func (i *intentInterleavingIter) isCurAtIntentIter() bool {
// When both iter and intentIter are exhausted, the return value is
// immaterial since this function won't be called. We examine the remaining
// cases below.
//
// During forward iteration (dir > 0), we have the following cases:
// - iter is exhausted: intentCmp < 0. This will never happen and callers
// check. Returns true.
// - intentIter is exhausted: intentCmp > 0. Returns false.
// - Neither is exhausted:
// - intentCmp < 0. This will never happen and callers check. Returns true.
// - intentCmp = 0. Returns true.
// - intentCmp > 0. Returns false.
//
// During reverse iteration (dir < 0), we have the following cases:
// - iter is exhausted: intentCmp > 0. Returns true.
// - intentIter is exhausted: intentCmp < 0. Returns false.
// - Neither is exhausted:
// - intentCmp > 0. Returns true.
// - intentCmp = 0. Returns false unless copositioned with bare range key.
// - intentCmp < 0. Returns false.
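//
// Illustrative example (a sketch, not from the source): with an intent on b
// and point keys a@2 and b@3, reverse iteration positioned on the intent at b
// has already moved iter back to a@2, so intentCmp > 0 and
// isCurAtIntentIterReverse() returns true.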
return (i.dir > 0 && i.isCurAtIntentIterForward()) || (i.dir < 0 && i.isCurAtIntentIterReverse())
}
// gcassert:inline
func (i *intentInterleavingIter) isCurAtIntentIterForward() bool {
return i.intentCmp <= 0
}
// gcassert:inline
func (i *intentInterleavingIter) isCurAtIntentIterReverse() bool {
return i.intentCmp > 0 || (i.intentCmp == 0 && i.iterBareRangeAtIntent)
}
func (i *intentInterleavingIter) UnsafeKey() MVCCKey {
if i.isCurAtIntentIter() {
return MVCCKey{Key: i.intentKey}
}
return i.iterKey
}
func (i *intentInterleavingIter) UnsafeValue() ([]byte, error) {
if i.isCurAtIntentIter() {
return i.intentIter.UnsafeValue()
}
return i.iter.UnsafeValue()
}
func (i *intentInterleavingIter) UnsafeLazyValue() pebble.LazyValue {
if i.isCurAtIntentIter() {
return i.intentIter.UnsafeLazyValue()
}
return i.iter.UnsafeLazyValue()
}
func (i *intentInterleavingIter) MVCCValueLenAndIsTombstone() (int, bool, error) {
if i.isCurAtIntentIter() {
return 0, false, errors.Errorf("not at MVCC value")
}
return i.iter.MVCCValueLenAndIsTombstone()
}
func (i *intentInterleavingIter) ValueLen() int {
if i.isCurAtIntentIter() {
return i.intentIter.ValueLen()
}
return i.iter.ValueLen()
}
func (i *intentInterleavingIter) Value() ([]byte, error) {
if i.isCurAtIntentIter() {
return i.intentIter.Value()
}
return i.iter.Value()
}
// HasPointAndRange implements SimpleMVCCIterator.
func (i *intentInterleavingIter) HasPointAndRange() (bool, bool) {
var hasPoint, hasRange bool
if i.iterValid {
hasPoint, hasRange = i.iter.HasPointAndRange()
}
if i.isCurAtIntentIter() {
hasPoint = true
// In the reverse direction, if the intent itself does not overlap a range
// key, then iter may be positioned on an earlier range key. Otherwise, iter
// will always be positioned on the correct range key.
//
// Note the following implications:
//
// hasRange → i.iterValid
// i.isCurAtIntentIter() && i.dir < 0 → i.intentCmp > 0 ||
// (i.intentCmp == 0 && i.iterBareRangeAtIntent)
//
// TODO(erikgrinaker): consider optimizing this comparison.
if hasRange && i.dir < 0 {
hasRange = i.intentCmp == 0 || i.iter.RangeBounds().EndKey.Compare(i.intentKey) > 0
}
}
return hasPoint, hasRange
}
// RangeBounds implements SimpleMVCCIterator.
func (i *intentInterleavingIter) RangeBounds() roachpb.Span {
return i.iter.RangeBounds()
}
// RangeKeys implements SimpleMVCCIterator.
func (i *intentInterleavingIter) RangeKeys() MVCCRangeKeyStack {
if _, hasRange := i.HasPointAndRange(); !hasRange {
return MVCCRangeKeyStack{}
}
return i.iter.RangeKeys()
}
// RangeKeyChanged implements SimpleMVCCIterator.
func (i *intentInterleavingIter) RangeKeyChanged() bool {
return i.rangeKeyChanged
}
func (i *intentInterleavingIter) Close() {
i.iter.Close()
i.intentIter.Close()
*i = intentInterleavingIter{
intentKeyAsNoTimestampMVCCKeyBacking: i.intentKeyAsNoTimestampMVCCKeyBacking,
intentKeyBuf: i.intentKeyBuf,
intentLimitKeyBuf: i.intentLimitKeyBuf,
}
intentInterleavingIterPool.Put(i)
}
func (i *intentInterleavingIter) SeekLT(key MVCCKey) {
adjustRangeKeyChanged := i.shouldAdjustSeekRangeKeyChanged()
i.dir = -1
i.valid = true
i.err = nil
if i.prefix {
i.err = errors.Errorf("prefix iteration is not permitted with SeekLT")
i.valid = false
return
}
if i.constraint != notConstrained {
// If the seek key of SeekLT is the boundary between the local and global
// keyspaces, iterators constrained in either direction are permitted.
// Iterators constrained to the local keyspace may be scanning from their
// upper bound. Iterators constrained to the global keyspace may have found
// a key on the boundary and may now be scanning before the key, using the
// boundary as an exclusive upper bound.
// NB: an iterator with bounds [L, U) is allowed to SeekLT over any key in
// [L, U]. For local keyspace iterators, U can be LocalMax and for global
// keyspace iterators L can be LocalMax.
localMax := bytes.Equal(key.Key, keys.LocalMax)
if !localMax {
i.checkConstraint(key.Key, true)
}
if localMax && i.constraint == constrainedToLocal {
// Move it down to below the lock table so we can iterate down cleanly into
// the local key space. Note that we disallow anyone using a seek key
// that is a local key above the lock table, and there should be no keys
// in the engine there either (at least not keys that we need to see using
// an MVCCIterator).
key.Key = keys.LocalRangeLockTablePrefix
}
}
i.iter.SeekLT(key)
if err := i.tryDecodeKey(); err != nil {
return
}
var intentSeekKey roachpb.Key
if key.Timestamp.IsEmpty() {
// Common case.
intentSeekKey, i.intentKeyBuf = keys.LockTableSingleKey(key.Key, i.intentKeyBuf)
} else {
// Seeking to a specific version, so need to see the intent. Since we need
// to see the intent for key.Key, and we don't have SeekLE, call Next() on
// the key before doing SeekLT.
intentSeekKey, i.intentKeyBuf = keys.LockTableSingleNextKey(key.Key, i.intentKeyBuf)
}
var limitKey roachpb.Key
if i.iterValid {
limitKey = i.makeLowerLimitKey()
}
iterState, err := i.intentIter.SeekEngineKeyLTWithLimit(EngineKey{Key: intentSeekKey}, limitKey)
if err = i.tryDecodeLockKey(iterState, err); err != nil {
return
}
i.computePos()
i.rangeKeyChanged = i.iter.RangeKeyChanged()
if adjustRangeKeyChanged {
i.adjustSeekRangeKeyChanged()
}
i.maybeSuppressRangeKeyChanged()
}
func (i *intentInterleavingIter) Prev() {
if i.err != nil {
return
}
// INVARIANT: !i.prefix
if i.dir > 0 {
// Switching from forward to reverse iteration.
isCurAtIntent := i.isCurAtIntentIterForward()
i.dir = -1
if !i.valid {
// Both iterators are exhausted, so step both backward.
i.valid = true
i.iter.Prev()
if err := i.tryDecodeKey(); err != nil {
return
}
var limitKey roachpb.Key
if i.iterValid {
limitKey = i.makeLowerLimitKey()
}
iterState, err := i.intentIter.PrevEngineKeyWithLimit(limitKey)
if err = i.tryDecodeLockKey(iterState, err); err != nil {
return
}
i.computePos()
i.rangeKeyChanged = i.iter.RangeKeyChanged()
i.maybeSuppressRangeKeyChanged()
return
}
// At least one of the iterators is not exhausted.
if isCurAtIntent {
// iter is after the intentIter, so must be at the provisional value.
// Step it backward. It will now point to a key that is before the
// intent key, or a range key whose start key is colocated with the
// intent, or be exhausted.
//
// Note that the code below does not specifically care if a bug (external
// to this code) violates the invariant that the provisional value is the
// highest timestamp key, but it does care that there is a timestamped
// value for this key (which it checks below). The internal invariant of
// this iterator implementation will ensure that iter is pointing to the
// highest timestamped key.
if i.intentCmp != 0 {
i.err = errors.Errorf("iter not at provisional value, cmp: %d", i.intentCmp)
i.valid = false
return
}
i.iter.Prev()
if err := i.tryDecodeKey(); err != nil {
return
}
i.computePos()
// TODO(sumeer): These calls to initialize and suppress rangeKeyChanged
// are unnecessary since i.valid is true and we will overwrite this work
// later in this function.
i.rangeKeyChanged = i.iter.RangeKeyChanged()
i.maybeSuppressRangeKeyChanged()
} else {
// The intentIter is after the iter. We don't know whether the iter key
// has an intent. Note that the iter could itself be positioned at an
// intent.
limitKey := i.makeLowerLimitKey()
iterState, err := i.intentIter.PrevEngineKeyWithLimit(limitKey)
if err = i.tryDecodeLockKey(iterState, err); err != nil {
return
}
i.computePos()
// TODO(sumeer): This call to suppress rangeKeyChanged is unnecessary
// since i.valid is true and we will overwrite this work later in this
// function.
i.maybeSuppressRangeKeyChanged()
}
// INVARIANT: i.valid
}
if !i.valid {
return
}
if i.isCurAtIntentIterReverse() {
// The iterator is positioned at an intent in intentIter, and iter is
// exhausted, positioned at a versioned value of a preceding key, or
// positioned on the start of a range key colocated with the intent.
// Stepping intentIter backward will ensure that intentKey is <= the key
// of iter (when neither is exhausted), but we may also need to step
// off the bare range key if there is one, and account for the fact
// that the range key may have already changed on the intent.
if i.iterBareRangeAtIntent {
i.iter.Prev()
if err := i.tryDecodeKey(); err != nil {
return
}
}
// Two cases:
// - i.iterBareRangeAtIntent: we have stepped iter backwards, and since
// we will no longer be at the intent, i.iter.RangeKeyChanged() should
// be used as the value of i.rangeKeyChanged.
// - !i.iterBareRangeAtIntent: we have not stepped iter. If the range
// bounds of iter covered the current intent, we have already shown them
// to the client. So the only reason for i.rangeKeyChanged to be true is
// if the range bounds do not cover the current intent. That is the
// i.iter.RangeBounds().EndKey.Compare(i.intentKey) <= 0 condition
// below.
i.rangeKeyChanged = i.iter.RangeKeyChanged() && (i.iterBareRangeAtIntent ||
i.iter.RangeBounds().EndKey.Compare(i.intentKey) <= 0)
var limitKey roachpb.Key
if i.iterValid {
limitKey = i.makeLowerLimitKey()
}
intentIterState, err := i.intentIter.PrevEngineKeyWithLimit(limitKey)
if err = i.tryDecodeLockKey(intentIterState, err); err != nil {
return
}
if !i.iterValid {
// If !i.iterValid, the intentIter can no longer be valid either.
// Note that limitKey is nil in this case.
if intentIterState != pebble.IterExhausted {
i.err = errors.Errorf("reverse iteration discovered intent without provisional value")
}
i.valid = false
return
}
// iterValid == true. So positioned at iter.
i.intentCmp = -1
if i.intentKey != nil {
i.computePos()
if i.intentCmp > 0 {
i.err = errors.Errorf("intentIter should not be after iter")
i.valid = false
return
}
// INVARIANT: i.intentCmp <= 0. So this call to
// maybeSuppressRangeKeyChanged() will be a no-op.
i.maybeSuppressRangeKeyChanged()
}
} else {
// Common case:
// The iterator is positioned at iter, i.e., at a MVCC value.
i.iter.Prev()
if err := i.tryDecodeKey(); err != nil {
return
}
if i.intentIterState == pebble.IterAtLimit && i.iterValid {
// TODO(sumeer): could avoid doing this if i.iter has stepped to
// different version of same key.
limitKey := i.makeLowerLimitKey()
iterState, err := i.intentIter.PrevEngineKeyWithLimit(limitKey)
if err = i.tryDecodeLockKey(iterState, err); err != nil {
return
}
}
i.computePos()
i.rangeKeyChanged = i.iter.RangeKeyChanged()
i.maybeSuppressRangeKeyChanged()
}
}
func (i *intentInterleavingIter) UnsafeRawKey() []byte {
if i.isCurAtIntentIter() {
return i.intentIter.UnsafeRawEngineKey()
}
return i.iter.UnsafeRawKey()
}
func (i *intentInterleavingIter) UnsafeRawMVCCKey() []byte {
if i.isCurAtIntentIter() {
if i.intentKeyAsNoTimestampMVCCKey == nil {
// Slow-path: tryDecodeLockKey was not able to initialize.
if cap(i.intentKeyAsNoTimestampMVCCKeyBacking) < len(i.intentKey)+1 {
i.intentKeyAsNoTimestampMVCCKeyBacking = make([]byte, 0, len(i.intentKey)+1)
}
i.intentKeyAsNoTimestampMVCCKeyBacking = append(
i.intentKeyAsNoTimestampMVCCKeyBacking[:0], i.intentKey...)
// Append the 0 byte representing the absence of a timestamp.
i.intentKeyAsNoTimestampMVCCKeyBacking = append(
i.intentKeyAsNoTimestampMVCCKeyBacking, 0)
i.intentKeyAsNoTimestampMVCCKey = i.intentKeyAsNoTimestampMVCCKeyBacking
}
return i.intentKeyAsNoTimestampMVCCKey
}
return i.iter.UnsafeRawKey()
}
func (i *intentInterleavingIter) ValueProto(msg protoutil.Message) error {
value, err := i.UnsafeValue()
if err != nil {
return err
}
return protoutil.Unmarshal(value, msg)
}
func (i *intentInterleavingIter) FindSplitKey(
start, end, minSplitKey roachpb.Key, targetSize int64,
) (MVCCKey, error) {
return findSplitKeyUsingIterator(i, start, end, minSplitKey, targetSize)
}
func (i *intentInterleavingIter) Stats() IteratorStats {
stats := i.iter.Stats()
intentStats := i.intentIter.Stats()
stats.Stats.Merge(intentStats.Stats)
return stats
}
// IsPrefix implements the MVCCIterator interface.
func (i *intentInterleavingIter) IsPrefix() bool {
return i.prefix
}
// assertInvariants asserts internal iterator invariants, returning an
// AssertionFailedf for any violations. It must be called on a valid iterator
// after a complete state transition.
func (i *intentInterleavingIter) assertInvariants() error {
// Assert general MVCCIterator invariants.
if err := assertMVCCIteratorInvariants(i); err != nil {
return err
}
// The underlying iterator must not have errored.
iterValid, err := i.iter.Valid()
if err != nil {
return errors.NewAssertionErrorWithWrappedErrf(err, "valid iter but i.iter errored")
}
intentValid := i.intentKey != nil
// At least one of the iterators must be valid. The iterator's validity state
// should match i.iterValid.
if !iterValid && !intentValid {
return errors.AssertionFailedf("i.valid=%t but both iterators are invalid", i.valid)
}
if iterValid != i.iterValid {
return errors.AssertionFailedf("i.iterValid=%t but i.iter.Valid=%t", i.iterValid, iterValid)
}
// i.dir must be either 1 or -1.
if i.dir != 1 && i.dir != -1 {
return errors.AssertionFailedf("i.dir=%v is not valid", i.dir)
}
// For valid iterators, the stored key must match the iterator key.
if iterValid {
if key := i.iter.UnsafeKey(); !i.iterKey.Equal(key) {
return errors.AssertionFailedf("i.iterKey=%q does not match i.iter.UnsafeKey=%q",
i.iterKey, key)
}
}
if intentValid {
intentKey := i.intentKey.Clone()
if engineKey, err := i.intentIter.UnsafeEngineKey(); err != nil {
return errors.NewAssertionErrorWithWrappedErrf(err, "valid i.intentIter errored")
} else if !engineKey.IsLockTableKey() {
return errors.AssertionFailedf("i.intentIter on non-locktable key %s", engineKey)
} else if key, err := keys.DecodeLockTableSingleKey(engineKey.Key); err != nil {
return errors.NewAssertionErrorWithWrappedErrf(err, "failed to decode lock table key %s",
engineKey)
} else if !intentKey.Equal(key) {
return errors.AssertionFailedf("i.intentKey %q != i.intentIter.UnsafeEngineKey() %q",
intentKey, key)
}
// If i.intentKey is set (i.e. intentValid is true), then intentIterState
// must be valid. The inverse is not always true.
if i.intentIterState != pebble.IterValid {
return errors.AssertionFailedf("i.intentKey=%q, but i.intentIterState=%v is not IterValid",
i.intentKey, i.intentIterState)
}
// If i.intentKey is set, then i.intentKeyAsNoTimestampMVCCKey must either
// be nil or equal to it with a \x00 byte appended.
if i.intentKeyAsNoTimestampMVCCKey != nil &&
!bytes.Equal(i.intentKeyAsNoTimestampMVCCKey, append(i.intentKey.Clone(), 0)) {
return errors.AssertionFailedf(
"i.intentKeyAsNoTimestampMVCCKey=%q differs from i.intentKey=%q",
i.intentKeyAsNoTimestampMVCCKey, i.intentKey)
}
}
// Check intentCmp depending on the iterator validity. We already know that
// one of the iterators must be valid.
if iterValid && intentValid {
if cmp := i.intentKey.Compare(i.iterKey.Key); i.intentCmp != cmp {
return errors.AssertionFailedf("i.intentCmp=%v does not match %v for intentKey=%q iterKey=%q",
i.intentCmp, cmp, i.intentKey, i.iterKey)
}
} else if iterValid {
if i.intentCmp != i.dir {
return errors.AssertionFailedf("i.intentCmp=%v != i.dir=%v for invalid i.intentIter",
i.intentCmp, i.dir)
}
} else if intentValid {
if i.intentCmp != -i.dir {
return errors.AssertionFailedf("i.intentCmp=%v == i.dir=%v for invalid i.iter",
i.intentCmp, i.dir)
}
}
// When on an intent in the forward direction, we must be on a provisional
// value and any range key must cover it.
if i.dir > 0 && i.isCurAtIntentIterForward() {
if !iterValid {
return errors.AssertionFailedf(
"missing provisional value for i.intentKey=%q: i.iter exhausted", i.intentKey)
} else if i.intentCmp != 0 {
return errors.AssertionFailedf(
"missing provisional value for i.intentKey=%q: i.intentCmp=%v is not 0",
i.intentKey, i.intentCmp)
} else if hasPoint, hasRange := i.iter.HasPointAndRange(); !hasPoint {
return errors.AssertionFailedf(
"missing provisional value for i.intentKey=%q: i.iter on bare range key",
i.intentKey)
} else if hasRange {
if bounds := i.iter.RangeBounds(); !bounds.ContainsKey(i.intentKey) {
return errors.AssertionFailedf("i.intentKey=%q not covered by i.iter range key %q",
i.intentKey, bounds)
}
}
}
// Check i.iterBareRangeAtIntent, which is only valid for i.intentCmp == 0.
if i.intentCmp == 0 {
if i.dir > 0 && i.iterBareRangeAtIntent {
return errors.AssertionFailedf("i.dir=%v can't have i.iterBareRangeAtIntent=%v",
i.dir, i.iterBareRangeAtIntent)
}
if i.dir < 0 && i.iterBareRangeAtIntent {
if hasPoint, hasRange := i.iter.HasPointAndRange(); hasPoint || !hasRange {
return errors.AssertionFailedf("i.iterBareRangeAtIntent=%v but hasPoint=%t hasRange=%t",
i.iterBareRangeAtIntent, hasPoint, hasRange)
}
// We've already asserted key equality for i.intentCmp == 0.
if !i.iterKey.Timestamp.IsEmpty() {
return errors.AssertionFailedf("i.iterBareRangeAtIntent=%v but i.iterKey has timestamp %s",
i.iterBareRangeAtIntent, i.iterKey.Timestamp)
}
}
}
return nil
}
// unsafeMVCCIterator is used in RaceEnabled test builds to randomly inject
// changes to unsafe keys retrieved from MVCCIterators.
type unsafeMVCCIterator struct {
MVCCIterator
keyBuf []byte
rawKeyBuf []byte
rawMVCCKeyBuf []byte
}
// gcassert:inline
func maybeWrapInUnsafeIter(iter MVCCIterator) MVCCIterator {
if util.RaceEnabled {
return &unsafeMVCCIterator{MVCCIterator: iter}
}
return iter
}
// gcassert:inline
func maybeUnwrapUnsafeIter(iter MVCCIterator) MVCCIterator {
if util.RaceEnabled {
if unsafeIter, ok := iter.(*unsafeMVCCIterator); ok {
return unsafeIter.MVCCIterator
}
}
return iter
}
var _ MVCCIterator = &unsafeMVCCIterator{}
func (i *unsafeMVCCIterator) SeekGE(key MVCCKey) {
i.mangleBufs()
i.MVCCIterator.SeekGE(key)
}
func (i *unsafeMVCCIterator) Next() {
i.mangleBufs()
i.MVCCIterator.Next()
}
func (i *unsafeMVCCIterator) NextKey() {
i.mangleBufs()
i.MVCCIterator.NextKey()
}
func (i *unsafeMVCCIterator) SeekLT(key MVCCKey) {
i.mangleBufs()
i.MVCCIterator.SeekLT(key)
}
func (i *unsafeMVCCIterator) Prev() {
i.mangleBufs()
i.MVCCIterator.Prev()
}
func (i *unsafeMVCCIterator) UnsafeKey() MVCCKey {
rv := i.MVCCIterator.UnsafeKey()
i.keyBuf = append(i.keyBuf[:0], rv.Key...)
rv.Key = i.keyBuf
return rv
}
func (i *unsafeMVCCIterator) UnsafeRawKey() []byte {
rv := i.MVCCIterator.UnsafeRawKey()
i.rawKeyBuf = append(i.rawKeyBuf[:0], rv...)
return i.rawKeyBuf
}
func (i *unsafeMVCCIterator) UnsafeRawMVCCKey() []byte {
rv := i.MVCCIterator.UnsafeRawMVCCKey()
i.rawMVCCKeyBuf = append(i.rawMVCCKeyBuf[:0], rv...)
return i.rawMVCCKeyBuf
}
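// mangleBufs randomly zeroes the buffers most recently handed out by
// UnsafeKey, UnsafeRawKey, and UnsafeRawMVCCKey. Well-behaved callers copy
// unsafe results before repositioning the iterator, so a caller that
// improperly retains them is likely to fail loudly under race builds.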
func (i *unsafeMVCCIterator) mangleBufs() {
if rand.Intn(2) == 0 {
for _, b := range [3][]byte{i.keyBuf, i.rawKeyBuf, i.rawMVCCKeyBuf} {
for i := range b {
b[i] = 0
}
}
}
}
// Copyright 2023 The Cockroach Authors.
//
// Use of this software is governed by the CockroachDB Software License
// included in the /LICENSE file.
package storage
import (
"bytes"
"context"
"sync"
"github.com/cockroachdb/cockroach/pkg/keys"
"github.com/cockroachdb/cockroach/pkg/kv/kvserver/concurrency/lock"
"github.com/cockroachdb/cockroach/pkg/roachpb"
"github.com/cockroachdb/cockroach/pkg/storage/enginepb"
"github.com/cockroachdb/cockroach/pkg/storage/fs"
"github.com/cockroachdb/cockroach/pkg/util/metamorphic"
"github.com/cockroachdb/cockroach/pkg/util/protoutil"
"github.com/cockroachdb/cockroach/pkg/util/uuid"
"github.com/cockroachdb/errors"
"github.com/cockroachdb/pebble"
)
// LockTableIterator is an EngineIterator that iterates over locks in the lock
// table keyspace. It performs no translation of input or output keys or values,
// so it is used like a normal EngineIterator, with the limitation that it can
// only be used to iterate over the lock table keyspace.
//
// The benefit of using a LockTableIterator is that it performs filtering of the
// locks in the lock table, only returning locks that match the configured
// filtering criteria and transparently skipping past locks that do not. The
// filtering criteria is expressed as a logical disjunction of two configuration
// parameters, at least one of which must be set:
//
// - MatchTxnID: if set, the iterator returns locks held by this transaction.
//
// - MatchMinStr: if set, the iterator returns locks held by any transaction
// with this strength or stronger.
//
// Expressed abstractly as a SQL query, the filtering criteria is:
//
// SELECT * FROM lock_table WHERE (MatchTxnID != 0 AND txn_id = MatchTxnID)
// OR (MatchMinStr != 0 AND strength >= MatchMinStr)
//
// Pushing this filtering logic into the iterator is a convenience for its
// users. It also allows the iterator to use its knowledge of the lock table
// keyspace structure to efficiently skip past locks that do not match the
// filtering criteria. It does this by seeking past many ignored locks when
// appropriate to avoid cases of O(ignored_locks) work, instead performing at
// most O(matching_locks + locked_keys) work.
//
// A common case where this matters is with shared locks. If the iterator is
// configured to ignore shared locks, a single key with a large number of shared
// locks can be skipped over with a single seek. Avoiding this unnecessary work
// is essential to avoiding quadratic behavior during shared lock acquisition
// and release.
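//
// As a rough usage sketch (illustrative only; ctx, reader, and ltKey are
// assumed to exist in the caller), a caller that only cares about exclusive
// and stronger locks on a single key might construct the iterator as:
//
//	iter, err := NewLockTableIterator(ctx, reader, LockTableIteratorOptions{
//		Prefix:      true,
//		MatchMinStr: lock.Exclusive,
//	})
//	if err != nil {
//		return err
//	}
//	defer iter.Close()
//	valid, err := iter.SeekEngineKeyGE(EngineKey{Key: ltKey})
//	for valid && err == nil {
//		// Process iter.UnsafeEngineKey() / iter.UnsafeValue() ...
//		valid, err = iter.NextEngineKey()
//	}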
type LockTableIterator struct {
iter EngineIterator
prefix bool
// If set, return locks with any strength held by this transaction.
matchTxnID uuid.UUID
// If set, return locks held by any transaction with this strength or
// stronger.
matchMinStr lock.Strength
// Used to avoid iterating over all shared locks on a key when not necessary,
// given the filtering criteria. See the comment about "skip past locks" above
// for details about why this is important.
itersBeforeSeek lockTableItersBeforeSeekHelper
}
var _ EngineIterator = &LockTableIterator{}
// LockTableIteratorOptions contains options used to create a LockTableIterator.
type LockTableIteratorOptions struct {
// See IterOptions.Prefix.
Prefix bool
// See IterOptions.LowerBound.
LowerBound roachpb.Key
// See IterOptions.UpperBound.
UpperBound roachpb.Key
// If set, return locks with any strength held by this transaction.
MatchTxnID uuid.UUID
// If set, return locks held by any transaction with this strength or
// stronger.
MatchMinStr lock.Strength
// ReadCategory is used to map to a user-understandable category string, for
// stats aggregation and metrics, and a Pebble-understandable QoS.
ReadCategory fs.ReadCategory
}
// validate validates the LockTableIteratorOptions.
func (opts LockTableIteratorOptions) validate() error {
if !opts.Prefix && len(opts.UpperBound) == 0 && len(opts.LowerBound) == 0 {
return errors.AssertionFailedf("LockTableIterator must set prefix or upper bound or lower bound")
}
if len(opts.LowerBound) != 0 && !isLockTableKey(opts.LowerBound) {
return errors.AssertionFailedf("LockTableIterator lower bound must be a lock table key")
}
if len(opts.UpperBound) != 0 && !isLockTableKey(opts.UpperBound) {
return errors.AssertionFailedf("LockTableIterator upper bound must be a lock table key")
}
if opts.MatchTxnID == uuid.Nil && opts.MatchMinStr == 0 {
return errors.AssertionFailedf("LockTableIterator must specify MatchTxnID, MatchMinStr, or both")
}
return nil
}
// toIterOptions converts the LockTableIteratorOptions to IterOptions.
func (opts LockTableIteratorOptions) toIterOptions() IterOptions {
return IterOptions{
Prefix: opts.Prefix,
LowerBound: opts.LowerBound,
UpperBound: opts.UpperBound,
}
}
var lockTableIteratorPool = sync.Pool{
New: func() interface{} { return new(LockTableIterator) },
}
// NewLockTableIterator creates a new LockTableIterator.
func NewLockTableIterator(
ctx context.Context, reader Reader, opts LockTableIteratorOptions,
) (*LockTableIterator, error) {
if err := opts.validate(); err != nil {
return nil, err
}
iter, err := reader.NewEngineIterator(ctx, opts.toIterOptions())
if err != nil {
return nil, err
}
ltIter := lockTableIteratorPool.Get().(*LockTableIterator)
*ltIter = LockTableIterator{
iter: iter,
prefix: opts.Prefix,
matchTxnID: opts.MatchTxnID,
matchMinStr: opts.MatchMinStr,
itersBeforeSeek: ltIter.itersBeforeSeek,
}
return ltIter, nil
}
// SeekEngineKeyGE implements the EngineIterator interface.
func (i *LockTableIterator) SeekEngineKeyGE(key EngineKey) (valid bool, err error) {
if err := checkLockTableKey(key.Key); err != nil {
return false, err
}
valid, err = i.iter.SeekEngineKeyGE(key)
if !valid || err != nil {
return valid, err
}
state, err := i.advanceToMatchingLock(+1, nil)
return state == pebble.IterValid, err
}
// SeekEngineKeyLT implements the EngineIterator interface.
func (i *LockTableIterator) SeekEngineKeyLT(key EngineKey) (valid bool, err error) {
if err := checkLockTableKey(key.Key); err != nil {
return false, err
}
valid, err = i.iter.SeekEngineKeyLT(key)
if !valid || err != nil {
return valid, err
}
state, err := i.advanceToMatchingLock(-1, nil)
return state == pebble.IterValid, err
}
// NextEngineKey implements the EngineIterator interface.
func (i *LockTableIterator) NextEngineKey() (valid bool, err error) {
valid, err = i.iter.NextEngineKey()
if !valid || err != nil {
return valid, err
}
state, err := i.advanceToMatchingLock(+1, nil)
return state == pebble.IterValid, err
}
// PrevEngineKey implements the EngineIterator interface.
func (i *LockTableIterator) PrevEngineKey() (valid bool, err error) {
valid, err = i.iter.PrevEngineKey()
if !valid || err != nil {
return valid, err
}
state, err := i.advanceToMatchingLock(-1, nil)
return state == pebble.IterValid, err
}
// SeekEngineKeyGEWithLimit implements the EngineIterator interface.
func (i *LockTableIterator) SeekEngineKeyGEWithLimit(
key EngineKey, limit roachpb.Key,
) (state pebble.IterValidityState, err error) {
if err := checkLockTableKey(key.Key); err != nil {
return 0, err
}
if err := checkLockTableKeyOrNil(limit); err != nil {
return 0, err
}
state, err = i.iter.SeekEngineKeyGEWithLimit(key, limit)
if state != pebble.IterValid || err != nil {
return state, err
}
return i.advanceToMatchingLock(+1, limit)
}
// SeekEngineKeyLTWithLimit implements the EngineIterator interface.
func (i *LockTableIterator) SeekEngineKeyLTWithLimit(
key EngineKey, limit roachpb.Key,
) (state pebble.IterValidityState, err error) {
if err := checkLockTableKey(key.Key); err != nil {
return 0, err
}
if err := checkLockTableKeyOrNil(limit); err != nil {
return 0, err
}
state, err = i.iter.SeekEngineKeyLTWithLimit(key, limit)
if state != pebble.IterValid || err != nil {
return state, err
}
return i.advanceToMatchingLock(-1, limit)
}
// NextEngineKeyWithLimit implements the EngineIterator interface.
func (i *LockTableIterator) NextEngineKeyWithLimit(
limit roachpb.Key,
) (state pebble.IterValidityState, err error) {
if err := checkLockTableKeyOrNil(limit); err != nil {
return 0, err
}
state, err = i.iter.NextEngineKeyWithLimit(limit)
if state != pebble.IterValid || err != nil {
return state, err
}
return i.advanceToMatchingLock(+1, limit)
}
// PrevEngineKeyWithLimit implements the EngineIterator interface.
func (i *LockTableIterator) PrevEngineKeyWithLimit(
limit roachpb.Key,
) (state pebble.IterValidityState, err error) {
if err := checkLockTableKeyOrNil(limit); err != nil {
return 0, err
}
state, err = i.iter.PrevEngineKeyWithLimit(limit)
if state != pebble.IterValid || err != nil {
return state, err
}
return i.advanceToMatchingLock(-1, limit)
}
// advanceToMatchingLock advances the iterator to the next lock table key that
// matches the configured filtering criteria. If limit is non-nil, the iterator
// will stop advancing once it reaches the limit.
func (i *LockTableIterator) advanceToMatchingLock(
dir int, limit roachpb.Key,
) (state pebble.IterValidityState, err error) {
defer i.itersBeforeSeek.reset()
for {
engineKey, err := i.iter.UnsafeEngineKey()
if err != nil {
return 0, err
}
str, txnID, err := engineKey.decodeLockTableKeyVersion()
if err != nil {
return 0, err
}
if i.matchingLock(str, txnID) {
return pebble.IterValid, nil
}
// We found a non-matching lock. Determine whether to step or seek past it.
// We only ever seek if we found a shared lock, because no other locking
// strength allows for multiple locks to be held by different transactions
// on the same key.
var seek bool
if str == lock.Shared {
seek = i.itersBeforeSeek.shouldSeek(engineKey.Key)
}
// Advance to the next key, either by stepping or seeking.
if seek {
ltKey, ltKeyErr := engineKey.ToLockTableKey()
if ltKeyErr != nil {
return 0, ltKeyErr
}
seekKeyBuf := &i.itersBeforeSeek.seekKeyBuf
var seekKey EngineKey
if dir < 0 {
// If iterating backwards and searching for locks held by a specific
// transaction, determine whether we have yet to reach /key/shared/txnID
// or have already passed it. If we have not yet passed it, seek to the
// specific version, remembering to offset the txn ID by 1 to account
// for the exclusive reverse seek. Otherwise, seek past the maximum
// (first) txn ID to the previous locking strength (exclusive).
// NOTE: Recall that txnIDs in the lock table key version are ordered in
// reverse lexicographical order.
if i.matchTxnID != uuid.Nil && bytes.Compare(txnID.GetBytes(), i.matchTxnID.GetBytes()) < 0 {
// The subtraction cannot underflow because matchTxnID cannot be the
// zero UUID if we are in this branch, with the iterator positioned
// after the matchTxnID. Assert for good measure.
if i.matchTxnID == uuid.Nil {
panic("matchTxnID is unexpectedly the zero UUID")
}
ltKey.TxnUUID = uuid.FromUint128(i.matchTxnID.ToUint128().Sub(1))
seekKey, *seekKeyBuf = ltKey.ToEngineKey(*seekKeyBuf)
} else {
ltKey.TxnUUID = uuid.Max
seekKey, *seekKeyBuf = ltKey.ToEngineKey(*seekKeyBuf)
}
state, err = i.iter.SeekEngineKeyLTWithLimit(seekKey, limit)
} else {
// If iterating forwards and searching for locks held by a specific
// transaction, determine whether we have yet to reach /key/shared/txnID
// or have already passed it. If we have not yet passed it, seek to the
// specific version. Otherwise, seek to the next key prefix.
// NOTE: Recall that txnIDs in the lock table key version are ordered in
// reverse lexicographical order.
// NOTE: Recall that shared locks are ordered last for a given key.
if i.matchTxnID != uuid.Nil && bytes.Compare(txnID.GetBytes(), i.matchTxnID.GetBytes()) > 0 {
ltKey.TxnUUID = i.matchTxnID
seekKey, *seekKeyBuf = ltKey.ToEngineKey(*seekKeyBuf)
} else {
// Seek to the next key prefix (locks on the next user key).
// Unlike the two reverse iteration cases and the forward
// iteration case where we have yet to reach /key/shared/txnID,
// this case deserves special consideration.
if i.prefix {
// If we are configured as a prefix iterator, do not seek to
// the next key prefix. Instead, return the IterExhausted
// state. This is more than just an optimization. Seeking to
// the next key prefix would move the underlying iterator
// (which is also configured for prefix iteration) to the
// next key prefix, if such a key prefix exists.
//
// This case could be decoupled from the itersBeforeSeek
// optimization. When performing prefix iteration, we could
// immediately detect cases where there are no more possible
// matching locks in the key prefix and return an exhausted
// state, instead of waiting until we decide to seek to do
// so. It's not clear that this additional complexity and
// code duplication is worth it, so we don't do it for now.
return pebble.IterExhausted, nil
}
// TODO(nvanbenschoten): for now, we call SeekEngineKeyGEWithLimit
// with the prefix of the next lock table key. If EngineIterator
// exposed an interface that called NextPrefix(), we could use that
// instead. This will require adding a NextPrefixWithLimit() method
// to pebble.
var seekKeyPrefix roachpb.Key
seekKeyPrefix, *seekKeyBuf = keys.LockTableSingleNextKey(ltKey.Key, *seekKeyBuf)
seekKey = EngineKey{Key: seekKeyPrefix}
}
state, err = i.iter.SeekEngineKeyGEWithLimit(seekKey, limit)
}
} else {
if dir < 0 {
state, err = i.iter.PrevEngineKeyWithLimit(limit)
} else {
state, err = i.iter.NextEngineKeyWithLimit(limit)
}
}
if state != pebble.IterValid || err != nil {
return state, err
}
}
}
// matchingLock returns whether the lock table key with the provided strength
// and transaction ID matches the configured filtering criteria.
func (i *LockTableIterator) matchingLock(str lock.Strength, txnID uuid.UUID) bool {
// Is this a lock held by the desired transaction?
return (i.matchTxnID != uuid.Nil && i.matchTxnID == txnID) ||
// Or, is this a lock with the desired strength or stronger?
(i.matchMinStr != 0 && i.matchMinStr <= str)
}
// Close implements the EngineIterator interface.
func (i *LockTableIterator) Close() {
i.iter.Close()
*i = LockTableIterator{
itersBeforeSeek: i.itersBeforeSeek,
}
lockTableIteratorPool.Put(i)
}
// HasPointAndRange implements the EngineIterator interface.
func (i *LockTableIterator) HasPointAndRange() (bool, bool) {
return i.iter.HasPointAndRange()
}
// EngineRangeBounds implements the EngineIterator interface.
func (i *LockTableIterator) EngineRangeBounds() (roachpb.Span, error) {
return i.iter.EngineRangeBounds()
}
// EngineRangeKeys implements the EngineIterator interface.
func (i *LockTableIterator) EngineRangeKeys() []EngineRangeKeyValue {
return i.iter.EngineRangeKeys()
}
// RangeKeyChanged implements the EngineIterator interface.
func (i *LockTableIterator) RangeKeyChanged() bool {
return i.iter.RangeKeyChanged()
}
// UnsafeEngineKey implements the EngineIterator interface.
func (i *LockTableIterator) UnsafeEngineKey() (EngineKey, error) {
return i.iter.UnsafeEngineKey()
}
// EngineKey implements the EngineIterator interface.
func (i *LockTableIterator) EngineKey() (EngineKey, error) {
return i.iter.EngineKey()
}
// UnsafeRawEngineKey implements the EngineIterator interface.
func (i *LockTableIterator) UnsafeRawEngineKey() []byte {
return i.iter.UnsafeRawEngineKey()
}
// UnsafeLockTableKey returns the current key as an unsafe LockTableKey.
// TODO(nvanbenschoten): use this more widely.
func (i *LockTableIterator) UnsafeLockTableKey() (LockTableKey, error) {
k, err := i.iter.UnsafeEngineKey()
if err != nil {
return LockTableKey{}, errors.Wrap(err, "retrieving lock table key")
}
return k.ToLockTableKey()
}
// LockTableKeyVersion returns the strength and txn ID from the version of the
// current key.
func (i *LockTableIterator) LockTableKeyVersion() (lock.Strength, uuid.UUID, error) {
k, err := i.iter.UnsafeEngineKey()
if err != nil {
return 0, uuid.UUID{}, errors.Wrap(err, "retrieving lock table key")
}
return k.decodeLockTableKeyVersion()
}
// UnsafeValue implements the EngineIterator interface.
func (i *LockTableIterator) UnsafeValue() ([]byte, error) {
return i.iter.UnsafeValue()
}
// UnsafeLazyValue implements the EngineIterator interface.
func (i *LockTableIterator) UnsafeLazyValue() pebble.LazyValue {
return i.iter.(*pebbleIterator).UnsafeLazyValue()
}
// Value implements the EngineIterator interface.
func (i *LockTableIterator) Value() ([]byte, error) {
return i.iter.Value()
}
// ValueLen implements the EngineIterator interface.
func (i *LockTableIterator) ValueLen() int {
return i.iter.ValueLen()
}
// ValueProto unmarshals the current value into the provided proto.
func (i *LockTableIterator) ValueProto(meta *enginepb.MVCCMetadata) error {
v, err := i.iter.UnsafeValue()
if err != nil {
return errors.Wrap(err, "retrieving lock table value")
}
return protoutil.Unmarshal(v, meta)
}
// CloneContext implements the EngineIterator interface.
func (i *LockTableIterator) CloneContext() CloneContext {
return i.iter.CloneContext()
}
// Stats implements the EngineIterator interface.
func (i *LockTableIterator) Stats() IteratorStats {
return i.iter.Stats()
}
//gcassert:inline
func isLockTableKey(key roachpb.Key) bool {
return bytes.HasPrefix(key, keys.LocalRangeLockTablePrefix)
}
var errNotLockTableKey = errors.New("LockTableIterator: key is not a lock table key")
//gcassert:inline
func checkLockTableKey(key roachpb.Key) error {
if !isLockTableKey(key) {
return errNotLockTableKey
}
return nil
}
//gcassert:inline
func checkLockTableKeyOrNil(key roachpb.Key) error {
if len(key) == 0 {
return nil
}
return checkLockTableKey(key)
}
// defaultLockTableItersBeforeSeek is the default value for the
// lockTableItersBeforeSeek metamorphic value.
const defaultLockTableItersBeforeSeek = 5
// lockTableItersBeforeSeek is the number of iterations to perform across the
// shared locks on a single user key before seeking past them. This is used to
// avoid iterating over all shared locks on a key when not necessary, given the
// filtering criteria.
var lockTableItersBeforeSeek = metamorphic.ConstantWithTestRange(
"lock-table-iters-before-seek",
defaultLockTableItersBeforeSeek, /* defaultValue */
0, /* min */
3, /* max */
)
// DisableMetamorphicLockTableItersBeforeSeek disables the metamorphic value for
// the duration of a test, resetting it at the end.
func DisableMetamorphicLockTableItersBeforeSeek(t interface {
Helper()
Cleanup(func())
}) {
t.Helper()
prev := lockTableItersBeforeSeek
lockTableItersBeforeSeek = defaultLockTableItersBeforeSeek
t.Cleanup(func() {
lockTableItersBeforeSeek = prev
})
}
// lockTableItersBeforeSeekHelper is a helper struct that keeps track of the
// number of iterations performed across the shared locks on a single user key
// while searching for matching locks in the lock table. It is used to determine
// when to seek past the shared locks to avoid O(ignored_locks) work.
//
// This is similar to the dynamic itersBeforeSeek algorithm that is used by
// pebbleMVCCScanner when scanning over mvcc versions for a key. However, we
// don't adaptively adjust the number of itersBeforeSeek as we go. Instead, we
// reset the iteration counter to lockTableItersBeforeSeek (default: 5) on each
// new key prefix. Doing something more sophisticated introduces complexity and
// it's not clear that this is worth it.
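//
// As a concrete example (assuming the default lockTableItersBeforeSeek of 5),
// when advanceToMatchingLock encounters a run of ignored shared locks on a
// single key, shouldSeek returns false for the first several calls with that
// key prefix, letting the iterator step past them cheaply; once the per-prefix
// budget is exhausted, shouldSeek returns true and the iterator seeks past the
// remaining shared locks on the key in a single operation.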
//
// The zero value is ready to use.
type lockTableItersBeforeSeekHelper struct {
curItersBeforeSeek int
curKeyPrefix roachpb.Key
// Buffers that avoid allocations.
keyPrefixBuf []byte
seekKeyBuf []byte
}
func (h *lockTableItersBeforeSeekHelper) reset() {
// Clearing the curKeyPrefix ensures that the next call to shouldSeek() will
// save the new key prefix and reset curItersBeforeSeek. This is why the zero
// value of the struct is ready to use.
h.curKeyPrefix = nil
}
func (h *lockTableItersBeforeSeekHelper) shouldSeek(keyPrefix roachpb.Key) bool {
if h.alwaysSeek() {
return true
}
if !h.curKeyPrefix.Equal(keyPrefix) {
// New key prefix (or curKeyPrefix was nil). Save it and reset the iteration
// count.
h.saveKeyPrefix(keyPrefix)
h.curItersBeforeSeek = lockTableItersBeforeSeek
} else {
// Same key prefix as before. Check if we should seek.
if h.curItersBeforeSeek == 0 {
return true
}
}
h.curItersBeforeSeek--
return false
}
func (h *lockTableItersBeforeSeekHelper) alwaysSeek() bool {
// Only returns true in tests when the metamorphic value is set to 0.
return lockTableItersBeforeSeek == 0
}
func (h *lockTableItersBeforeSeekHelper) saveKeyPrefix(keyPrefix roachpb.Key) {
h.keyPrefixBuf = append(h.keyPrefixBuf[:0], keyPrefix...)
h.curKeyPrefix = h.keyPrefixBuf
}
// Copyright 2023 The Cockroach Authors.
//
// Use of this software is governed by the CockroachDB Software License
// included in the /LICENSE file.
package storage
import (
"context"
"sync"
"github.com/cockroachdb/cockroach/pkg/keys"
"github.com/cockroachdb/cockroach/pkg/kv/kvpb"
"github.com/cockroachdb/cockroach/pkg/kv/kvserver/concurrency/lock"
"github.com/cockroachdb/cockroach/pkg/roachpb"
"github.com/cockroachdb/cockroach/pkg/storage/enginepb"
"github.com/cockroachdb/cockroach/pkg/storage/fs"
"github.com/cockroachdb/cockroach/pkg/util/uuid"
"github.com/cockroachdb/errors"
)
// Fixed length slice for all supported lock strengths for replicated locks. May
// be used to iterate supported lock strengths in strength order (strongest to
// weakest).
var replicatedLockStrengths = [...]lock.Strength{lock.Intent, lock.Exclusive, lock.Shared}
func init() {
if replicatedLockStrengths[0] != lock.MaxStrength {
panic("replicatedLockStrengths[0] != lock.MaxStrength; update replicatedLockStrengths?")
}
}
// replicatedLockStrengthToIndexMap returns a mapping between (strength, index)
// pairs that can be used to index into the lockTableScanner.ownLocks array.
//
// Trying to use a lock strength that isn't supported with replicated locks to
// index into the lockTableScanner.ownLocks array will cause a runtime error.
var replicatedLockStrengthToIndexMap = func() (m [lock.MaxStrength + 1]int) {
// Initialize all to -1.
for str := range m {
m[str] = -1
}
// Set the indices of the valid strengths.
for i, str := range replicatedLockStrengths {
m[str] = i
}
return m
}()
// strongerOrEqualStrengths returns all supported lock strengths for replicated
// locks that are as strong or stronger than the provided strength. The returned
// slice is ordered from strongest to weakest.
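//
// For example, strongerOrEqualStrengths(lock.Exclusive) returns
// [lock.Intent, lock.Exclusive], and strongerOrEqualStrengths(lock.Shared)
// returns all three supported strengths.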
func strongerOrEqualStrengths(str lock.Strength) []lock.Strength {
return replicatedLockStrengths[:replicatedLockStrengthToIndexMap[str]+1]
}
// minConflictLockStrength returns the minimum lock strength that conflicts with
// the provided lock strength.
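//
// For example, acquiring a Shared lock only conflicts with Exclusive locks
// (and intents) held by other transactions, so its minimum conflicting
// strength is lock.Exclusive, whereas Exclusive and Intent acquisitions
// conflict with anything at or above lock.Shared.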
func minConflictLockStrength(str lock.Strength) (lock.Strength, error) {
switch str {
case lock.None:
// Don't conflict with any locks held by other transactions.
return lock.None, nil
case lock.Shared:
return lock.Exclusive, nil
case lock.Exclusive, lock.Intent:
return lock.Shared, nil
default:
return 0, errors.AssertionFailedf(
"lockTableKeyScanner: unexpected lock strength %s", str.String())
}
}
// lockTableKeyScanner is used to scan a single key in the replicated lock
// table. It searches for locks on the key that conflict with a (transaction,
// lock strength) pair and for locks that the transaction has already acquired
// on the key.
//
// The purpose of a lockTableKeyScanner is to determine whether a transaction
// can acquire a lock on a key or perform an MVCC mutation on a key, and if so,
// what lock table keys the transaction should write to perform the operation.
type lockTableKeyScanner struct {
iter *LockTableIterator
// The transaction attempting to acquire a lock. The ID will be zero if a
// non-transactional request is attempting to perform an MVCC mutation.
txnID uuid.UUID
// Stop adding conflicting locks and abort scan once the maxConflicts limit
// is reached. Ignored if zero.
maxConflicts int64
// Stop adding conflicting locks and abort the scan once the cumulative size
// of collected conflicting locks reaches the targetBytesPerConflict limit.
// Ignored if zero.
targetBytesPerConflict int64
// Stores any error returned. If non-nil, iteration short circuits.
err error
// Stores any locks that conflict with the transaction and locking strength.
conflicts []roachpb.Lock
// Stores the total byte size of conflicts.
conflictBytes int64
// Stores any locks that the transaction has already acquired.
ownLocks [len(replicatedLockStrengths)]*enginepb.MVCCMetadata
// Avoids heap allocations.
ltKeyBuf []byte
ltValue enginepb.MVCCMetadata
firstOwnLock enginepb.MVCCMetadata
}
var lockTableKeyScannerPool = sync.Pool{
New: func() interface{} { return new(lockTableKeyScanner) },
}
// newLockTableKeyScanner creates a new lockTableKeyScanner.
//
// txnID corresponds to the ID of the transaction attempting to acquire locks.
// If txnID is valid (non-empty), locks held by the transaction with any
// strength will be accumulated into the ownLocks array. Otherwise, if txnID is
// empty, the request is non-transactional and no locks will be accumulated into
// the ownLocks array.
//
// str is the strength of the lock that the transaction (or non-transactional
// request) is attempting to acquire. The scanner will search for locks held by
// other transactions that conflict with this strength[1].
//
// maxConflicts is the maximum number of conflicting locks that the scanner
// should accumulate before returning an error. If maxConflicts is zero, the
// scanner will accumulate all conflicting locks.
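//
// targetBytesPerConflict similarly bounds the cumulative size of accumulated
// conflicting locks; if it is zero, no size limit is applied.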
//
// [1] It's valid to pass in lock.None for str. lock.None doesn't conflict with
// any other replicated locks; as such, passing lock.None configures the scanner
// to only return locks from the supplied txnID.
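//
// A rough usage sketch (illustrative only; reader, txn, key, and readCategory
// are assumed to exist in the caller):
//
//	scanner, err := newLockTableKeyScanner(
//		ctx, reader, txn.ID, lock.Exclusive, 0 /* maxConflicts */,
//		0 /* targetBytesPerConflict */, readCategory)
//	if err != nil {
//		return err
//	}
//	defer scanner.close()
//	if err := scanner.scan(key); err != nil {
//		return err // e.g. a *kvpb.LockConflictError
//	}
//	if meta := scanner.foundOwn(lock.Exclusive); meta != nil {
//		// The transaction already holds an exclusive lock on key.
//	}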
func newLockTableKeyScanner(
ctx context.Context,
reader Reader,
txnID uuid.UUID,
str lock.Strength,
maxConflicts int64,
targetBytesPerConflict int64,
readCategory fs.ReadCategory,
) (*lockTableKeyScanner, error) {
minConflictStr, err := minConflictLockStrength(str)
if err != nil {
return nil, err
}
iter, err := NewLockTableIterator(ctx, reader, LockTableIteratorOptions{
Prefix: true,
MatchTxnID: txnID,
MatchMinStr: minConflictStr,
ReadCategory: readCategory,
})
if err != nil {
return nil, err
}
s := lockTableKeyScannerPool.Get().(*lockTableKeyScanner)
s.iter = iter
s.txnID = txnID
s.maxConflicts = maxConflicts
s.targetBytesPerConflict = targetBytesPerConflict
return s, nil
}
func (s *lockTableKeyScanner) close() {
s.iter.Close()
*s = lockTableKeyScanner{ltKeyBuf: s.ltKeyBuf}
lockTableKeyScannerPool.Put(s)
}
// scan scans the lock table at the provided key for locks held by other
// transactions that conflict with the configured locking strength and for locks
// of any strength that the configured transaction has already acquired.
func (s *lockTableKeyScanner) scan(key roachpb.Key) error {
s.resetScanState()
for ok := s.seek(key); ok; ok = s.getOneAndAdvance() {
}
return s.afterScan()
}
// resetScanState resets the scanner's state before a scan.
func (s *lockTableKeyScanner) resetScanState() {
s.err = nil
s.conflicts = nil
s.conflictBytes = 0
for i := range s.ownLocks {
s.ownLocks[i] = nil
}
s.ltValue.Reset()
s.firstOwnLock.Reset()
}
// afterScan returns any error encountered during the scan.
func (s *lockTableKeyScanner) afterScan() error {
if s.err != nil {
return s.err
}
if len(s.conflicts) != 0 {
return &kvpb.LockConflictError{Locks: s.conflicts}
}
return nil
}
// seek seeks the iterator to the first lock table key associated with the
// provided key. Returns true if the scanner should continue scanning, false
// if not.
func (s *lockTableKeyScanner) seek(key roachpb.Key) bool {
var ltKey roachpb.Key
ltKey, s.ltKeyBuf = keys.LockTableSingleKey(key, s.ltKeyBuf)
valid, err := s.iter.SeekEngineKeyGE(EngineKey{Key: ltKey})
if err != nil {
s.err = err
}
return valid
}
// getOneAndAdvance consumes the current lock table key and value and advances
// the iterator. Returns true if the scanner should continue scanning, false if
// not.
func (s *lockTableKeyScanner) getOneAndAdvance() bool {
ltKey, ok := s.getLockTableKey()
if !ok {
return false
}
ltValue, ok := s.getLockTableValue()
if !ok {
return false
}
if !s.consumeLockTableKeyValue(ltKey, ltValue) {
return false
}
return s.advance()
}
// advance advances the iterator to the next lock table key.
func (s *lockTableKeyScanner) advance() bool {
valid, err := s.iter.NextEngineKey()
if err != nil {
s.err = err
}
return valid
}
// getLockTableKey decodes the current lock table key.
func (s *lockTableKeyScanner) getLockTableKey() (LockTableKey, bool) {
ltEngKey, err := s.iter.UnsafeEngineKey()
if err != nil {
s.err = err
return LockTableKey{}, false
}
ltKey, err := ltEngKey.ToLockTableKey()
if err != nil {
s.err = err
return LockTableKey{}, false
}
return ltKey, true
}
// getLockTableValue decodes the current lock table value.
func (s *lockTableKeyScanner) getLockTableValue() (*enginepb.MVCCMetadata, bool) {
err := s.iter.ValueProto(&s.ltValue)
if err != nil {
s.err = err
return nil, false
}
return &s.ltValue, true
}
// consumeLockTableKeyValue consumes the current lock table key and value, which
// is either a conflicting lock or a lock held by the scanning transaction.
func (s *lockTableKeyScanner) consumeLockTableKeyValue(
ltKey LockTableKey, ltValue *enginepb.MVCCMetadata,
) bool {
if ltValue.Txn == nil {
s.err = errors.AssertionFailedf("unexpectedly found non-transactional lock: %v", ltValue)
return false
}
if ltKey.TxnUUID != ltValue.Txn.ID {
s.err = errors.AssertionFailedf("lock table key (%+v) and value (%+v) txn ID mismatch", ltKey, ltValue)
return false
}
if ltKey.TxnUUID == s.txnID {
return s.consumeOwnLock(ltKey, ltValue)
}
return s.consumeConflictingLock(ltKey, ltValue)
}
// consumeOwnLock consumes a lock held by the scanning transaction.
func (s *lockTableKeyScanner) consumeOwnLock(
ltKey LockTableKey, ltValue *enginepb.MVCCMetadata,
) bool {
var ltValueCopy *enginepb.MVCCMetadata
if s.firstOwnLock.Txn == nil {
// This is the first lock held by the transaction that we've seen, so
// we can avoid the heap allocation.
ltValueCopy = &s.firstOwnLock
} else {
ltValueCopy = new(enginepb.MVCCMetadata)
}
// NOTE: this will alias internal pointer fields of ltValueCopy with those
// in ltValue, but this will not lead to issues when ltValue is updated by
// the next call to getLockTableValue, because its internal fields will be
// reset by protoutil.Unmarshal before unmarshalling.
*ltValueCopy = *ltValue
s.ownLocks[replicatedLockStrengthToIndexMap[ltKey.Strength]] = ltValueCopy
return true
}
// consumeConflictingLock consumes a conflicting lock.
func (s *lockTableKeyScanner) consumeConflictingLock(
ltKey LockTableKey, ltValue *enginepb.MVCCMetadata,
) bool {
conflict := roachpb.MakeLock(ltValue.Txn, ltKey.Key.Clone(), ltKey.Strength)
conflictSize := int64(conflict.Size())
s.conflictBytes += conflictSize
s.conflicts = append(s.conflicts, conflict)
if s.maxConflicts != 0 && s.maxConflicts == int64(len(s.conflicts)) {
return false
}
if s.targetBytesPerConflict != 0 && s.conflictBytes >= s.targetBytesPerConflict {
return false
}
return true
}
// foundOwn returns the lock table value for the provided strength if the
// transaction has already acquired a lock of that strength. Returns nil if not.
func (s *lockTableKeyScanner) foundOwn(str lock.Strength) *enginepb.MVCCMetadata {
return s.ownLocks[replicatedLockStrengthToIndexMap[str]]
}
// Copyright 2021 The Cockroach Authors.
//
// Use of this software is governed by the CockroachDB Software License
// included in the /LICENSE file.
package storage
import (
"io"
"github.com/cockroachdb/cockroach/pkg/roachpb"
"github.com/cockroachdb/cockroach/pkg/storage/fs"
"github.com/cockroachdb/cockroach/pkg/util/protoutil"
"github.com/cockroachdb/errors"
"github.com/cockroachdb/errors/oserror"
"github.com/cockroachdb/pebble/vfs"
)
// MinVersionFilename is the name of the file containing a marshaled
// roachpb.Version that can be updated during storage-related migrations
// and checked on startup to determine if we can safely use a
// backwards-incompatible feature.
const MinVersionFilename = "STORAGE_MIN_VERSION"
// writeMinVersionFile writes the provided version to disk. The caller must
// guarantee that the version will never be downgraded below the given version.
func writeMinVersionFile(atomicRenameFS vfs.FS, dir string, version roachpb.Version) error {
// TODO(jackson): Assert that atomicRenameFS supports atomic renames
// once Pebble is bumped to the appropriate SHA.
if version == (roachpb.Version{}) {
return errors.New("min version should not be empty")
}
ok, err := MinVersionIsAtLeastTargetVersion(atomicRenameFS, dir, version)
if err != nil {
return err
}
if ok {
return nil
}
b, err := protoutil.Marshal(&version)
if err != nil {
return err
}
filename := atomicRenameFS.PathJoin(dir, MinVersionFilename)
if err := fs.SafeWriteToFile(atomicRenameFS, dir, filename, b, fs.UnspecifiedWriteCategory); err != nil {
return err
}
return nil
}
// MinVersionIsAtLeastTargetVersion returns whether the min version recorded
// on disk is at least the target version.
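//
// A hypothetical usage sketch (env, dir, and target are illustrative): a
// caller gating a backwards-incompatible on-disk feature could check
//
//	ok, err := MinVersionIsAtLeastTargetVersion(env, dir, target)
//	if err != nil {
//		return err
//	}
//	if ok {
//		// Safe to rely on the new on-disk format.
//	}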
func MinVersionIsAtLeastTargetVersion(
atomicRenameFS vfs.FS, dir string, target roachpb.Version,
) (bool, error) {
// TODO(jackson): Assert that atomicRenameFS supports atomic renames
// once Pebble is bumped to the appropriate SHA.
if target == (roachpb.Version{}) {
return false, errors.New("target version should not be empty")
}
minVersion, ok, err := getMinVersion(atomicRenameFS, dir)
if err != nil {
return false, err
}
if !ok {
return false, nil
}
return !minVersion.Less(target), nil
}
// getMinVersion returns the min version recorded on disk. If the min version
// file doesn't exist, returns ok=false.
func getMinVersion(atomicRenameFS vfs.FS, dir string) (_ roachpb.Version, ok bool, _ error) {
// TODO(jackson): Assert that atomicRenameFS supports atomic renames
// once Pebble is bumped to the appropriate SHA.
filename := atomicRenameFS.PathJoin(dir, MinVersionFilename)
f, err := atomicRenameFS.Open(filename)
if oserror.IsNotExist(err) {
return roachpb.Version{}, false, nil
}
if err != nil {
return roachpb.Version{}, false, err
}
defer f.Close()
b, err := io.ReadAll(f)
if err != nil {
return roachpb.Version{}, false, err
}
version := roachpb.Version{}
if err := protoutil.Unmarshal(b, &version); err != nil {
return roachpb.Version{}, false, err
}
return version, true, nil
}
// Copyright 2015 The Cockroach Authors.
//
// Use of this software is governed by the CockroachDB Software License
// included in the /LICENSE file.
package storage
import (
"bytes"
"context"
"fmt"
"hash/fnv"
"io"
"math"
"runtime"
"sort"
"sync"
"time"
"github.com/cockroachdb/cockroach/pkg/build"
"github.com/cockroachdb/cockroach/pkg/col/coldata"
"github.com/cockroachdb/cockroach/pkg/keys"
"github.com/cockroachdb/cockroach/pkg/kv/kvnemesis/kvnemesisutil"
"github.com/cockroachdb/cockroach/pkg/kv/kvpb"
"github.com/cockroachdb/cockroach/pkg/kv/kvserver/concurrency/lock"
"github.com/cockroachdb/cockroach/pkg/kv/kvserver/uncertainty"
"github.com/cockroachdb/cockroach/pkg/roachpb"
"github.com/cockroachdb/cockroach/pkg/settings"
"github.com/cockroachdb/cockroach/pkg/settings/cluster"
"github.com/cockroachdb/cockroach/pkg/storage/enginepb"
"github.com/cockroachdb/cockroach/pkg/storage/fs"
"github.com/cockroachdb/cockroach/pkg/util"
"github.com/cockroachdb/cockroach/pkg/util/admission"
"github.com/cockroachdb/cockroach/pkg/util/bufalloc"
"github.com/cockroachdb/cockroach/pkg/util/buildutil"
"github.com/cockroachdb/cockroach/pkg/util/envutil"
"github.com/cockroachdb/cockroach/pkg/util/hlc"
"github.com/cockroachdb/cockroach/pkg/util/iterutil"
"github.com/cockroachdb/cockroach/pkg/util/log"
"github.com/cockroachdb/cockroach/pkg/util/mon"
"github.com/cockroachdb/cockroach/pkg/util/protoutil"
"github.com/cockroachdb/cockroach/pkg/util/timeutil"
"github.com/cockroachdb/cockroach/pkg/util/tracing"
"github.com/cockroachdb/cockroach/pkg/util/uuid"
"github.com/cockroachdb/errors"
"github.com/cockroachdb/pebble"
)
const (
// MVCCVersionTimestampSize is the size of the timestamp portion of MVCC
// version keys (used to update stats).
MVCCVersionTimestampSize int64 = 12
// RecommendedMaxOpenFiles is the recommended value for RocksDB's
// max_open_files option.
RecommendedMaxOpenFiles = 10000
// MinimumMaxOpenFiles is the minimum value that RocksDB's max_open_files
// option can be set to. While this should be set as high as possible, the
// minimum total for a single store node must be under 2048 for Windows
// compatibility.
MinimumMaxOpenFiles = 1700
// MaxConflictsPerLockConflictErrorDefault is the default value for the
// maximum number of locks reported by ExportToSST and Scan operations in a
// LockConflictError. It is set to half of the maximum lock table size. This
// value is subject to tuning in real environments as more data becomes
// available.
MaxConflictsPerLockConflictErrorDefault = 5000
// TargetBytesPerLockConflictErrorDefault is the default value for the maximum
// total size of locks reported by ExportToSST and Scan operations in a
// LockConflictError. This value is subject to tuning in real environments as
// more data becomes available.
TargetBytesPerLockConflictErrorDefault = 8388608
)
var minWALSyncInterval = settings.RegisterDurationSetting(
settings.ApplicationLevel,
"rocksdb.min_wal_sync_interval",
"minimum duration between syncs of the RocksDB WAL",
0*time.Millisecond,
settings.NonNegativeDurationWithMaximum(1*time.Second),
)
// MaxConflictsPerLockConflictError sets the maximum number of locks returned
// in a LockConflictError by operations that return multiple locks per error.
var MaxConflictsPerLockConflictError = settings.RegisterIntSetting(
settings.ApplicationLevel,
"storage.mvcc.max_intents_per_error",
"maximum number of locks returned in errors during evaluation",
MaxConflictsPerLockConflictErrorDefault,
settings.WithName("storage.mvcc.max_conflicts_per_lock_conflict_error"),
)
// TargetBytesPerLockConflictError sets the target total byte size of locks
// collected in a LockConflictError. Collection stops once the total size of
// collected locks exceeds this threshold.
var TargetBytesPerLockConflictError = settings.RegisterIntSetting(
settings.ApplicationLevel,
"storage.mvcc.target_intent_bytes_per_error",
"maximum total lock size returned in errors during evaluation",
TargetBytesPerLockConflictErrorDefault,
settings.WithName("storage.mvcc.target_bytes_per_lock_conflict_error"),
)
// getMaxConcurrentCompactions wraps the maxConcurrentCompactions env var in a
// func that may be installed on Options.MaxConcurrentCompactions. It also
// imposes a floor on the max, so that an engine is always created with at least
// 1 slot for a compaction.
//
// NB: This function inspects the environment every time it's called. This is
// okay, because Engine construction in NewPebble will invoke it and store the
// value on the Engine itself.
func getMaxConcurrentCompactions() int {
n := envutil.EnvOrDefaultInt(
"COCKROACH_CONCURRENT_COMPACTIONS", func() int {
// The old COCKROACH_ROCKSDB_CONCURRENCY environment variable was never
// documented, but customers were told about it and use it today in
// production. We don't want to break them, so if the new env var
// is unset but COCKROACH_ROCKSDB_CONCURRENCY is set, use the old env
// var's value. This old env var has a wart in that it's expressed as a
// number of concurrency slots to make available to both flushes and
// compactions (a vestige of the corresponding RocksDB option's
// mechanics). We need to adjust it to be in terms of just compaction
// concurrency by subtracting the flushing routine's dedicated slot.
//
// TODO(jackson): Should envutil expose its `getEnv` internal func for
// cases like this where we actually want to know whether it's present
// or not; not just fallback to a default?
if oldV := envutil.EnvOrDefaultInt("COCKROACH_ROCKSDB_CONCURRENCY", 0); oldV > 0 {
return oldV - 1
}
// By default use up to min(numCPU-1, 3) threads for background
// compactions per store (reserving the final process for flushes).
const max = 3
if n := runtime.GOMAXPROCS(0); n-1 < max {
return n - 1
}
return max
}())
if n < 1 {
return 1
}
return n
}
// l0SubLevelCompactionConcurrency is the sub-level threshold at which to
// allow an increase in compaction concurrency. The maximum is still
// controlled by pebble.Options.MaxConcurrentCompactions. The default of 2
// allows an additional compaction (so total 1 + 1 = 2 compactions) when the
// sub-level count is 2, and increments concurrency by 1 whenever sub-level
// count increases by 2 (so 1 + 2 = 3 compactions) when sub-level count is 4,
// and so on, i.e., floor(1 + l/2), where l is the number of sub-levels. See
// the logic in
// https://github.com/cockroachdb/pebble/blob/86593692e09f904f4ea739e065074f44f40ec9ba/compaction_picker.go#L1204-L1220.
//
// We abbreviate l0SubLevelCompactionConcurrency to lslcc below. And all the
// discussion below is in units of compaction concurrency. Let l represent the
// current sub-level count. MaxConcurrentCompactions is a constant and not a
// function of l. The upper bound on concurrent compactions, that we computed
// above, is represented as upper-bound-cc(lslcc, l), since it is a function
// of both lslcc and l. The formula is:
//
// upper-bound-cc(lslcc, l) = floor(1 + l/lslcc)
//
// where in the example above lslcc=2.
//
// A visual representation (where lslcc is fixed) is shown below, where the x
// axis is the current number of sub-levels and the y axis is in units of
// compaction concurrency (cc).
//
// ^ + upper-bound-cc
// | +
// cc |---------+------------- MaxConcurrentCompactions
// | +
// | +
// |+
// |
// ------------------------->
// l
//
// Where the permitted concurrent compactions is the minimum across the two
// curves shown above.
//
// ^
// |
// cc | ********** permitted concurrent compactions
// | *
// | *
// |*
// |
// ------------------------->
// l
//
// Next we discuss the interaction with admission control, which is where care
// is needed. Admission control (see admission.ioLoadListener) gives out
// tokens that shape the incoming traffic. The tokens are a function of l and
// the measured compaction bandwidth out of L0. But the measured compaction
// bandwidth is itself a function of the tokens and upper-bound-cc(lslcc,l)
// and MaxConcurrentCompactions. To tease apart this interaction, we note that
// when l increases and AC starts shaping incoming traffic, it initially gives
// out tokens that are higher than the measured compaction bandwidth. That is,
// it over-admits. So if Pebble has the ability to increase compaction
// bandwidth to handle this over-admission, it has the opportunity to do so,
// and the increased compaction bandwidth will feed back into even more
// tokens, and this will repeat until Pebble has no ability to further
// increase the compaction bandwidth. This simple analysis suffices when
// upper-bound-cc(lslcc,l) is always infinity since Pebble can increase up to
// MaxConcurrentCompactions as soon as it starts falling behind. This is
// represented in the following diagram.
//
// ^----
// | -- - AC tokens
// | -- + actual concurrent compactions
// cc |****+*+*+*+**** * permitted concurrent compactions
// | + --
// | + --------
// |+
// |
// ------------------------->
// l
//
// Observe that in this diagram, the permitted concurrent compactions is
// always equal to MaxConcurrentCompactions, and the actual concurrent
// compactions ramps up very quickly to that permitted level as l increases.
// The AC tokens start off close to infinity and decline as l
// increases, but are still higher than the permitted concurrent compactions.
// And the AC tokens fall below the permitted concurrent compactions *after*
// the actual concurrent compactions have reached that permitted level. This
// "fall below" happens to try to reduce the number of sub-levels (since AC
// has an objective of what sub-level count it wants to be stable at).
//
// For the remainder of this discussion we will ignore the actual concurrent
// compactions and just use permitted concurrent compactions to serve both the
// roles of actual and permitted. In this simplified world, the objective we
// have is that AC tokens exceed the permitted concurrent compactions until
// permitted concurrent compactions have reached their max value. When this
// objective is not satisfied, we will unnecessarily throttle traffic even
// though there is the possibility to allow higher traffic since we have not
// yet used up to the permitted concurrent compactions.
//
// Note, we are depicting AC tokens in terms of overall compaction concurrency
// in this analysis, while real AC tokens are based on compactions out of L0,
// and some of the compactions are happening between other levels. This is
// fine if we reinterpret what we discuss here as AC tokens as not the real AC
// tokens but the effect of the real AC tokens on the overall compaction
// bandwidth needed in the LSM. To illustrate this idea, say 25% of the
// compaction concurrency is spent on L0=>Lbase compactions and 75% on other
// compactions. Say current permitted compaction concurrency is 4 (so
// L0=>Lbase compaction concurrency is 1) and MaxConcurrentCompactions is 8.
// And say that real AC tokens throttles traffic to this current level that
// can be compacted out of L0. Then in the analysis here, we consider AC
// tokens as shaping to a compaction concurrency of 4.
//
// As a reminder,
//
// permitted-concurrent-compactions = min(upper-bound-cc(lslcc,l), MaxConcurrentCompactions)
//
// We analyze AC tokens also expressed in units of compaction concurrency,
// where ac-tokens are a function of l and the current
// permitted-concurrent-compactions (since permitted==actual, in our
// simplification), which we can write as
// ac-tokens(l,permitted-concurrent-compactions). We consider two parts of
// ac-tokens: the first part when upper-bound-cc(lslcc,l) <=
// MaxConcurrentCompactions, and the second part when upper-bound-cc(lslcc,l)
// > MaxConcurrentCompactions. There is a transition from the first part to
// the second part at some point as l increases. In the first part,
// permitted-concurrent-compactions=upper-bound-cc(lslcc,l), and so ac-tokens
// is a function of l and upper-bound-cc. We translate our original objective
// into the following simplified objective for the first part:
//
// ac-tokens should be greater than upper-bound-cc as l increases, and it
// should be equal to or greater than upper-bound-cc when upper-bound-cc
// becomes equal to MaxConcurrentCompactions.
//
// The following diagram shows an example that achieves this objective:
//
// ^
// |-------
// | -- + - ac-tokens
// | -- + + upper-bound-cc
// cc |*********+*--*********** * MaxConcurrentCompactions
// | + --
// | + -----
// |+
// |
// ------------------------->
// l
//
// Note that the objective does not say anything about ac-tokens after
// upper-bound-cc exceeds MaxConcurrentCompactions since what happens at
// higher l values did not prevent us from achieving the maximum compaction
// concurrency.
//
// ac-tokens for regular traffic with lslcc=2:
//
// Admission control (see admission.ioLoadListener) starts shaping regular
// traffic at a sub-level count of 5, with twice the tokens as compaction
// bandwidth (out of L0) at sub-level count 5, and tokens equal to the
// compaction bandwidth at sub-level count of 10. AC wants to operate at a
// stable point of 10 sub-levels under regular traffic overload. Let
// MaxConcurrentCompactions be represented as mcc. At sub-level count 5, the
// upper-bound-cc is floor(1+5/2)=3, so ac-tokens are representative of a
// concurrency of min(3,mcc)*2. At sub-level count of 10, the upper-bound-cc
// is floor(1+10/2)=6, so tokens are also representative of a compaction
// concurrency of min(6,mcc)*1. This regular traffic token shaping behavior is
// hard-wired in ioLoadListener (with code constants), and we don't currently
// have a reason to change it. If MaxConcurrentCompactions is <= 6, the
// objective stated earlier is achieved, since min(3,mcc)*2 and min(6,mcc) are
// >= mcc. But if MaxConcurrentCompactions > 6, AC will throttle to compaction
// concurrency of 6, which fails the objective since we have ignored the
// ability to increase compaction concurrency. Note that this analysis is
// somewhat pessimistic since if we are consistently operating at 5 or more
// sub-levels, other levels in the LSM are also building up compaction debt,
// and there is another mechanism in Pebble that increases compaction
// concurrency in response to compaction debt. Nevertheless, this pessimistic
// analysis shows that we are ok with MaxConcurrentCompactions <= 6. We will
// consider the possibility of reducing lslcc below.
//
// ac-tokens for elastic traffic with lslcc=2:
//
// For elastic traffic, admission control starts shaping traffic at sub-level
// count of 2, with tokens equal to 1.25x the compaction bandwidth, so
// ac-tokens is 1.25*min(mcc,floor(1+2/2))=1.25*min(mcc,2) compaction
// concurrency. And at sub-level count of 4, the tokens are equal to 1x the
// compaction bandwidth, so ac-tokens is 1*min(mcc,floor(1+4/2))=min(mcc,3)
// compaction concurrency. AC wants to operate at a stable point of 4
// sub-levels under elastic traffic overload. For mcc=3 (the default value),
// the above values at l=2 and l=4 are 2.5 and 3 respectively. Even though 2.5
// < 3, we deem the value of ac-tokens as acceptable with mcc=3. But
// deployments which use machines with a large number of CPUs are sometimes
// configured with a larger value of MaxConcurrentCompactions. In those cases
// elastic traffic will be throttled even though there is an opportunity to
// increase compaction concurrency to allow more elastic traffic.
//
// If a deployment administrator knows that the system is provisioned such
// that aggressively increasing up to MaxConcurrentCompactions is harmless to
// foreground traffic, they can set l0SubLevelCompactionConcurrency=1. The
// ac-tokens will be:
//
// - Elastic: sub-level=2, 1.25*min(mcc,3); sub-level=4, 1*min(mcc,5);
// sub-level=6, 0.75*min(mcc,7). With mcc=4, at sub-level=2, we get
// ac-tokens=3.75, which is deemed acceptable. Also, compaction debt will
// increase and allow for utilizing even higher concurrency, eventually,
// and since this is elastic traffic that eventual behavior is acceptable.
//
// - Regular: sub-level=5, 2*min(mcc,6); sub-level=10, 1*min(mcc,11). With
// mcc=12, at sub-level=5, we get ac-tokens=12. So we are satisfying the
// objective even if mcc were as high as 12.
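//
// As a minimal sketch of the upper bound formula above (purely illustrative;
// this helper does not exist in the package), the permitted concurrency for a
// given sub-level count l could be computed as:
//
//	func permittedCompactionConcurrency(l, lslcc, maxConcurrentCompactions int) int {
//		upperBound := 1 + l/lslcc // floor(1 + l/lslcc) via integer division
//		return min(upperBound, maxConcurrentCompactions)
//	}
//
// e.g. with lslcc=2 and maxConcurrentCompactions=3, l=2 permits 2 compactions,
// l=4 permits 3, and higher sub-level counts remain capped at 3.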
var l0SubLevelCompactionConcurrency = envutil.EnvOrDefaultInt(
"COCKROACH_L0_SUB_LEVEL_CONCURRENCY", 2)
// MakeValue returns the inline value.
func MakeValue(meta enginepb.MVCCMetadata) roachpb.Value {
return roachpb.Value{RawBytes: meta.RawBytes}
}
func emptyKeyError() error {
// TODO(nvanbenschoten): this error, along with many in this file, should be
// converted to an errors.AssertionFailed error.
return errors.Errorf("attempted access to empty key")
}
// MVCCKeyValue contains the raw bytes of the value for a key.
type MVCCKeyValue struct {
Key MVCCKey
// if Key.IsValue(), Value is an encoded MVCCValue.
// else, Value is an encoded MVCCMetadata.
Value []byte
}
// MVCCRangeKeyValue contains the raw bytes of the value for a range key.
type MVCCRangeKeyValue struct {
RangeKey MVCCRangeKey
Value []byte
}
// Clone returns a copy of the MVCCRangeKeyValue.
func (r MVCCRangeKeyValue) Clone() MVCCRangeKeyValue {
r.RangeKey = r.RangeKey.Clone()
if r.Value != nil {
r.Value = append([]byte{}, r.Value...)
}
return r
}
// optionalValue represents an optional MVCCValue. It is preferred
// over a *roachpb.Value or *MVCCValue to avoid the forced heap allocation.
type optionalValue struct {
MVCCValue
exists bool
}
func makeOptionalValue(v MVCCValue) optionalValue {
return optionalValue{MVCCValue: v, exists: true}
}
func (v *optionalValue) IsPresent() bool {
return v.exists && v.Value.IsPresent()
}
func (v *optionalValue) IsTombstone() bool {
return v.exists && !v.Value.IsPresent()
}
func (v *optionalValue) ToPointer() *roachpb.Value {
if !v.exists {
return nil
}
// Copy to prevent forcing receiver onto heap.
cpy := v.Value
return &cpy
}
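// isOriginTimestampWinner reports whether a value proposed at proposedTS wins
// a last-write-wins comparison against the existing value, preferring the
// existing value's OriginTimestamp (when set) over its MVCC timestamp. It also
// returns the timestamp of the existing value used for the comparison; a
// non-existent value always loses (i.e. the proposal wins).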
func (v *optionalValue) isOriginTimestampWinner(
proposedTS hlc.Timestamp, inclusive bool,
) (bool, hlc.Timestamp) {
if !v.exists {
return true, hlc.Timestamp{}
}
existTS := v.Value.Timestamp
if v.MVCCValueHeader.OriginTimestamp.IsSet() {
existTS = v.MVCCValueHeader.OriginTimestamp
}
return existTS.Less(proposedTS) || (inclusive && existTS.Equal(proposedTS)), existTS
}
// isSysLocal returns whether the key is system-local.
func isSysLocal(key roachpb.Key) bool {
return key.Compare(keys.LocalMax) < 0
}
// isAbortSpanKey returns whether the key is an abort span key.
func isAbortSpanKey(key roachpb.Key) bool {
if !bytes.HasPrefix(key, keys.LocalRangeIDPrefix) {
return false
}
_ /* rangeID */, infix, suffix, _ /* detail */, err := keys.DecodeRangeIDKey(key)
if err != nil {
return false
}
hasAbortSpanSuffix := infix.Equal(keys.LocalRangeIDReplicatedInfix) && suffix.Equal(keys.LocalAbortSpanSuffix)
return hasAbortSpanSuffix
}
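// exampleAbortSpanKeyCheck is an illustrative sketch (a hypothetical helper) of
// the keys isAbortSpanKey recognizes: a key built by the abort span constructor
// in the keys package is range-ID local with the abort span suffix, while an
// ordinary user key is not even local.
func exampleAbortSpanKeyCheck(rangeID roachpb.RangeID, txnID uuid.UUID) {
	_ = isAbortSpanKey(keys.AbortSpanKey(rangeID, txnID)) // true
	_ = isAbortSpanKey(roachpb.Key("a"))                  // false: not a range-ID local key
}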
// updateStatsForInline updates stat counters for an inline value
// (abort span entries for example). These are simpler as they don't
// involve intents, multiple versions, or MVCC range tombstones.
func updateStatsForInline(
ms *enginepb.MVCCStats,
key roachpb.Key,
origMetaKeySize, origMetaValSize, metaKeySize, metaValSize int64,
) {
sys := isSysLocal(key)
// Remove counts for this key if the original size is non-zero.
if origMetaKeySize != 0 {
if sys {
ms.SysBytes -= (origMetaKeySize + origMetaValSize)
ms.SysCount--
// We only do this check in updateStatsForInline since
// abort span keys are always inlined - we don't associate
// timestamps with them.
if isAbortSpanKey(key) {
ms.AbortSpanBytes -= (origMetaKeySize + origMetaValSize)
}
} else {
ms.LiveBytes -= (origMetaKeySize + origMetaValSize)
ms.LiveCount--
ms.KeyBytes -= origMetaKeySize
ms.ValBytes -= origMetaValSize
ms.KeyCount--
ms.ValCount--
}
}
// Add counts for this key if the new size is non-zero.
if metaKeySize != 0 {
if sys {
ms.SysBytes += metaKeySize + metaValSize
ms.SysCount++
if isAbortSpanKey(key) {
ms.AbortSpanBytes += metaKeySize + metaValSize
}
} else {
ms.LiveBytes += metaKeySize + metaValSize
ms.LiveCount++
ms.KeyBytes += metaKeySize
ms.ValBytes += metaValSize
ms.KeyCount++
ms.ValCount++
}
}
}
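// exampleInlineDeletionStats is an illustrative sketch of the call pattern used
// by callers such as MVCCBlindPutInlineWithPrev further below: to account for
// removing an existing inline value, pass its original meta key/value sizes and
// zero for the new sizes, so that only the subtraction branch above runs.
func exampleInlineDeletionStats(
	ms *enginepb.MVCCStats, key roachpb.Key, origMetaKeySize, origMetaValSize int64,
) {
	updateStatsForInline(ms, key, origMetaKeySize, origMetaValSize, 0 /* metaKeySize */, 0 /* metaValSize */)
}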
// updateStatsOnMerge updates metadata stats while merging inlined
// values. Unfortunately, we're unable to keep accurate stats on merges as the
// actual details of the merge play out asynchronously during compaction. We
// actually undercount by only adding the size of the value.RawBytes byte slice
// (and eliding MVCCVersionTimestampSize, corresponding to the metadata overhead,
// even for the very "first" write). These errors are corrected during splits and
// merges.
func updateStatsOnMerge(key roachpb.Key, valSize, nowNanos int64) enginepb.MVCCStats {
var ms enginepb.MVCCStats
sys := isSysLocal(key)
ms.AgeTo(nowNanos)
ms.ContainsEstimates = 1
if sys {
ms.SysBytes += valSize
} else {
ms.LiveBytes += valSize
ms.ValBytes += valSize
}
return ms
}
// updateStatsOnPut updates stat counters for a newly put value,
// including both the metadata key & value bytes and the mvcc
// versioned value's key & value bytes. If the value is not a
// deletion tombstone, updates the live stat counters as well.
// If this value is an intent, updates the intent counters.
func updateStatsOnPut(
key roachpb.Key,
prevIsValue bool,
prevValSize int64,
origMetaKeySize, origMetaValSize, metaKeySize, metaValSize int64,
orig, meta *enginepb.MVCCMetadata,
) enginepb.MVCCStats {
var ms enginepb.MVCCStats
if isSysLocal(key) {
// Handling system-local keys is straightforward because
// we don't track ageable quantities for them (we
// could, but don't). Remove the contributions from the
// original, if any, and add in the new contributions.
if orig != nil {
ms.SysBytes -= origMetaKeySize + origMetaValSize
if orig.Txn != nil {
// If the original value was an intent, we're replacing the
// intent. Note that since it's a system key, it doesn't affect
// IntentBytes, IntentCount, and correspondingly, LockAge.
ms.SysBytes -= orig.KeyBytes + orig.ValBytes
}
ms.SysCount--
}
ms.SysBytes += meta.KeyBytes + meta.ValBytes + metaKeySize + metaValSize
ms.SysCount++
return ms
}
// Handle non-sys keys. This follows the same scheme: if there was a previous
// value, perhaps even an intent, subtract its contributions, and then add the
// new contributions. The complexity here is that we need to properly update
// GCBytesAge and LockAge, which don't follow the same semantics. The difference
// between them is that an intent accrues LockAge from its own timestamp on,
// while GCBytesAge is accrued by versions according to the following rules:
// 1. a (non-tombstone) value that is shadowed by a newer write accrues age at
// the point in time at which it is shadowed (i.e. the newer write's timestamp).
// 2. a tombstone value accrues age at its own timestamp (note that this means
// the tombstone's own contribution only -- the actual write that was deleted
// is then shadowed by this tombstone, and will thus also accrue age from
// the tombstone's value on, as per 1).
//
// This seems relatively straightforward, but only because it omits pesky
// details, which have been relegated to the comments below.
// Remove current live counts for this key.
if orig != nil {
ms.KeyCount--
// Move the (so far empty) stats to the timestamp at which the
// previous entry was created, which is where we wish to reclassify
// its contributions.
ms.AgeTo(orig.Timestamp.WallTime)
// If the original metadata for this key was an intent, subtract
// its contribution from stat counters as it's being replaced.
if orig.Txn != nil {
// Subtract counts attributable to intent we're replacing.
ms.ValCount--
ms.IntentBytes -= (orig.KeyBytes + orig.ValBytes)
ms.IntentCount--
ms.LockCount--
}
// If the original intent is a deletion, we're removing the intent. This
// means removing its contribution at the *old* timestamp because it has
// accrued GCBytesAge that we need to offset (rule 2).
//
// Note that there is a corresponding block for the case of a non-deletion
// (rule 1) below, at meta.Timestamp.
if orig.Deleted {
ms.KeyBytes -= origMetaKeySize
ms.ValBytes -= origMetaValSize
if orig.Txn != nil {
ms.KeyBytes -= orig.KeyBytes
ms.ValBytes -= orig.ValBytes
}
}
// Rule 1 implies that sometimes it's not only the old meta and the new meta
// that matter, but also the version below both of them. For example, take
// a version at t=1 and an intent over it at t=2 that is now being replaced
// (t=3). Then orig.Timestamp will be 2, and meta.Timestamp will be 3, but
// rule 1 tells us that for the interval [2,3) we have already accrued
// GCBytesAge for the version at t=1 that is now moot, because the intent
// at t=2 is moving to t=3; we have to emit a GCBytesAge offset to that effect.
//
// The code below achieves this by making the old version live again at
// orig.Timestamp, and then marking it as shadowed at meta.Timestamp below.
// This only happens when that version wasn't a tombstone, in which case it
// contributes from its own timestamp on anyway, and doesn't need adjustment.
//
// Note that when meta.Timestamp equals orig.Timestamp, the computation is
// moot, which is something our callers may exploit (since retrieving the
// previous version is not for free).
if prevIsValue {
// If the previous value (exists and) was not a deletion tombstone, make it
// live at orig.Timestamp. We don't have to do anything if there is a
// previous value that is a tombstone: according to rule two its age
// contributions are anchored to its own timestamp, so moving some values
// higher up doesn't affect the contributions tied to that key.
ms.LiveBytes += MVCCVersionTimestampSize + prevValSize
}
// Note that there is an interesting special case here: it's possible that
// meta.Timestamp.WallTime < orig.Timestamp.WallTime. This wouldn't happen
// outside of tests (due to our semantics of txn.ReadTimestamp, which never
// decreases) but it sure does happen in randomized testing. An earlier
// version of the code used `Forward` here, which is incorrect as it would be
// a no-op and fail to subtract out the intent bytes/GC age incurred due to
// removing the meta entry at `orig.Timestamp` (when `orig != nil`).
ms.AgeTo(meta.Timestamp.WallTime)
if prevIsValue {
// Make the previous non-deletion value non-live again, as explained in the
// sibling block above.
ms.LiveBytes -= MVCCVersionTimestampSize + prevValSize
}
// If the original version wasn't a deletion, it becomes non-live at meta.Timestamp
// as this is where it is shadowed.
if !orig.Deleted {
ms.LiveBytes -= orig.KeyBytes + orig.ValBytes
ms.LiveBytes -= origMetaKeySize + origMetaValSize
ms.LiveCount--
ms.KeyBytes -= origMetaKeySize
ms.ValBytes -= origMetaValSize
if orig.Txn != nil {
ms.KeyBytes -= orig.KeyBytes
ms.ValBytes -= orig.ValBytes
}
}
} else {
ms.AgeTo(meta.Timestamp.WallTime)
}
// If the new version isn't a deletion tombstone, add it to live counters.
if !meta.Deleted {
ms.LiveBytes += meta.KeyBytes + meta.ValBytes + metaKeySize + metaValSize
ms.LiveCount++
}
ms.KeyBytes += meta.KeyBytes + metaKeySize
ms.ValBytes += meta.ValBytes + metaValSize
ms.KeyCount++
ms.ValCount++
if meta.Txn != nil {
ms.IntentBytes += meta.KeyBytes + meta.ValBytes
ms.IntentCount++
ms.LockCount++
}
return ms
}
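// exampleRuleOneOffset is an illustrative sketch (not used by the code above)
// of the "rule 1" GCBytesAge adjustment performed for prevIsValue: the delta
// makes the shadowed previous version live at origNanos and non-live again at
// metaNanos, so that adding the delta to the absolute stats emits a GCBytesAge
// offset covering [origNanos, metaNanos).
func exampleRuleOneOffset(prevValSize, origNanos, metaNanos int64) enginepb.MVCCStats {
	var ms enginepb.MVCCStats
	ms.AgeTo(origNanos)
	ms.LiveBytes += MVCCVersionTimestampSize + prevValSize
	ms.AgeTo(metaNanos)
	ms.LiveBytes -= MVCCVersionTimestampSize + prevValSize
	return ms
}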
// updateStatsOnResolve updates stat counters with the difference
// between the original and new metadata sizes. The size of the
// resolved value (key & bytes) are subtracted from the intents
// counters if commit=true.
func updateStatsOnResolve(
key roachpb.Key,
prevIsValue bool,
prevValSize int64,
origMetaKeySize, origMetaValSize, metaKeySize, metaValSize int64,
orig, meta *enginepb.MVCCMetadata,
commit bool,
) enginepb.MVCCStats {
var ms enginepb.MVCCStats
if isSysLocal(key) {
// Straightforward: old contribution goes, new contribution comes, and we're done.
ms.SysBytes -= origMetaKeySize + origMetaValSize + orig.KeyBytes + orig.ValBytes
ms.SysBytes += metaKeySize + metaValSize + meta.KeyBytes + meta.ValBytes
return ms
}
// In the main case, we had an old intent at orig.Timestamp, and a new intent
// or value at meta.Timestamp. We'll walk through the contributions below,
// taking special care for LockAge and GCBytesAge.
//
// Jump into the method below for extensive commentary on their semantics
// and "rules one and two".
_ = updateStatsOnPut
ms.AgeTo(orig.Timestamp.WallTime)
// At orig.Timestamp, the original meta key disappears. Fortunately, the
// GCBytesAge computations are fairly transparent because the intent is either
// not a deletion in which case it is always live (it's the most recent value,
// so it isn't shadowed -- see rule 1), or it *is* a deletion, in which case
// its own timestamp is where it starts accruing GCBytesAge (rule 2).
ms.KeyBytes -= origMetaKeySize + orig.KeyBytes
ms.ValBytes -= origMetaValSize + orig.ValBytes
// Next, we adjust LiveBytes based on meta.Deleted and orig.Deleted.
// Note that LiveBytes here corresponds to ts = orig.Timestamp.WallTime.
// LiveBytes at ts = meta.Timestamp.WallTime is adjusted below.
// If the original value was deleted, there is no need to adjust the
// contribution of the original key and value to LiveBytes. Otherwise, we need
// to subtract the original key and value's contribution from LiveBytes.
if !orig.Deleted {
ms.LiveBytes -= origMetaKeySize + origMetaValSize
ms.LiveBytes -= orig.KeyBytes + orig.ValBytes
ms.LiveCount--
}
// LockAge is always accrued from the intent's own timestamp on.
ms.IntentBytes -= orig.KeyBytes + orig.ValBytes
ms.IntentCount--
ms.LockCount--
// If there was a previous value (before orig.Timestamp), and it was not a
// deletion tombstone, then we have to adjust its GCBytesAge contribution
// which was previously anchored at orig.Timestamp and now has to move to
// meta.Timestamp. Paralleling very similar code in the method below, this
// is achieved by making the previous key live between orig.Timestamp and
// meta.Timestamp. When the two are equal, this will be a zero adjustment,
// and so in that case the caller may simply pass prevValSize=0 and can
// skip computing that quantity in the first place.
_ = updateStatsOnPut
if prevIsValue {
ms.LiveBytes += MVCCVersionTimestampSize + prevValSize
}
ms.AgeTo(meta.Timestamp.WallTime)
if prevIsValue {
// The previous non-deletion value becomes non-live at meta.Timestamp.
// See the sibling code above.
ms.LiveBytes -= MVCCVersionTimestampSize + prevValSize
}
// At meta.Timestamp, the new meta key appears.
ms.KeyBytes += metaKeySize + meta.KeyBytes
ms.ValBytes += metaValSize + meta.ValBytes
// If the new version is not a deletion tombstone, it is live at meta.Timestamp.
if !meta.Deleted {
ms.LiveBytes += (metaKeySize + metaValSize) + (meta.KeyBytes + meta.ValBytes)
ms.LiveCount++
}
if !commit {
// If not committing, the intent reappears (but at meta.Timestamp).
//
// This is the case in which an intent is pushed (a similar case
// happens when an intent is overwritten, but that's handled in
// updateStatsOnPut, not this method).
ms.IntentBytes += meta.KeyBytes + meta.ValBytes
ms.IntentCount++
ms.LockCount++
}
return ms
}
// updateStatsOnAcquireLock updates MVCCStats for acquiring a replicated shared
// or exclusive lock on a key. If orig is not nil, the lock acquisition is
// replacing an existing lock with a new lock that has the exact same txn ID and
// strength.
func updateStatsOnAcquireLock(
origKeySize, origValSize, keySize, valSize int64, orig, meta *enginepb.MVCCMetadata,
) enginepb.MVCCStats {
var ms enginepb.MVCCStats
// Remove current lock counts.
if orig != nil {
// Move the (so far empty) stats to the timestamp at which the previous
// lock was acquired, which is where we wish to reclassify its initial
// contributions.
ms.AgeTo(orig.Timestamp.WallTime)
// Subtract counts attributable to the lock we're replacing.
ms.LockBytes -= origKeySize + origValSize
ms.LockCount--
}
// Now add in the contributions from the new lock at the new acquisition
// timestamp.
ms.AgeTo(meta.Timestamp.WallTime)
ms.LockBytes += keySize + valSize
ms.LockCount++
return ms
}
// updateStatsOnReleaseLock updates MVCCStats for releasing a replicated shared
// or exclusive lock on a key. orig is the lock being released, and must not be
// nil.
func updateStatsOnReleaseLock(
origKeySize, origValSize int64, orig *enginepb.MVCCMetadata,
) enginepb.MVCCStats {
var ms enginepb.MVCCStats
ms.AgeTo(orig.Timestamp.WallTime)
ms.LockBytes -= origKeySize + origValSize
ms.LockCount--
return ms
}
// updateStatsOnRangeKeyClear updates MVCCStats for clearing an entire
// range key stack.
func updateStatsOnRangeKeyClear(rangeKeys MVCCRangeKeyStack) enginepb.MVCCStats {
var ms enginepb.MVCCStats
ms.Subtract(updateStatsOnRangeKeyPut(rangeKeys))
return ms
}
// updateStatsOnRangeKeyClearVersion updates MVCCStats for clearing a single
// version in a range key stack. The given range key stack must be before the
// clear.
func updateStatsOnRangeKeyClearVersion(
rangeKeys MVCCRangeKeyStack, version MVCCRangeKeyVersion,
) enginepb.MVCCStats {
var ms enginepb.MVCCStats
// If we're removing the newest version, hide it from the slice such that we
// can invert the put contribution.
if version.Timestamp.Equal(rangeKeys.Newest()) {
if rangeKeys.Len() == 1 {
ms.Add(updateStatsOnRangeKeyClear(rangeKeys))
return ms
}
rangeKeys.Versions = rangeKeys.Versions[1:]
}
ms.Subtract(updateStatsOnRangeKeyPutVersion(rangeKeys, version))
return ms
}
// updateStatsOnRangeKeyPut updates MVCCStats for writing a new range key stack.
func updateStatsOnRangeKeyPut(rangeKeys MVCCRangeKeyStack) enginepb.MVCCStats {
var ms enginepb.MVCCStats
ms.AgeTo(rangeKeys.Newest().WallTime)
ms.RangeKeyCount++
ms.RangeKeyBytes += int64(EncodedMVCCKeyPrefixLength(rangeKeys.Bounds.Key)) +
int64(EncodedMVCCKeyPrefixLength(rangeKeys.Bounds.EndKey))
for _, v := range rangeKeys.Versions {
ms.AgeTo(v.Timestamp.WallTime)
ms.RangeKeyBytes += int64(EncodedMVCCTimestampSuffixLength(v.Timestamp))
ms.RangeValCount++
ms.RangeValBytes += int64(len(v.Value))
}
return ms
}
// updateStatsOnRangeKeyPutVersion updates MVCCStats for writing a new range key
// version in an existing range key stack. The given range key stack must be
// before the put.
func updateStatsOnRangeKeyPutVersion(
rangeKeys MVCCRangeKeyStack, version MVCCRangeKeyVersion,
) enginepb.MVCCStats {
var ms enginepb.MVCCStats
// We currently assume all range keys are MVCC range tombstones. We therefore
// have to move the GCBytesAge contribution of the key up from the latest
// version to the new version if it's written at the top.
if rangeKeys.Newest().Less(version.Timestamp) {
keyBytes := int64(EncodedMVCCKeyPrefixLength(rangeKeys.Bounds.Key)) +
int64(EncodedMVCCKeyPrefixLength(rangeKeys.Bounds.EndKey))
ms.AgeTo(rangeKeys.Newest().WallTime)
ms.RangeKeyBytes -= keyBytes
ms.AgeTo(version.Timestamp.WallTime)
ms.RangeKeyBytes += keyBytes
}
// Account for the new version.
ms.AgeTo(version.Timestamp.WallTime)
ms.RangeKeyBytes += int64(EncodedMVCCTimestampSuffixLength(version.Timestamp))
ms.RangeValCount++
ms.RangeValBytes += int64(len(version.Value))
return ms
}
// updateStatsOnRangeKeyCover updates MVCCStats for when an MVCC range key
// covers an MVCC point key at the given timestamp. The valueLen and
// isTombstone are attributes of the point key.
func updateStatsOnRangeKeyCover(
ts hlc.Timestamp, key MVCCKey, valueLen int, isTombstone bool,
) enginepb.MVCCStats {
var ms enginepb.MVCCStats
ms.AgeTo(ts.WallTime)
if !isTombstone {
ms.LiveCount--
ms.LiveBytes -= int64(key.EncodedSize()) + int64(valueLen)
}
return ms
}
// updateStatsOnRangeKeyCoverStats updates MVCCStats for when an MVCC range
// tombstone covers existing data whose stats are already known.
func updateStatsOnRangeKeyCoverStats(ts hlc.Timestamp, cur enginepb.MVCCStats) enginepb.MVCCStats {
var ms enginepb.MVCCStats
ms.AgeTo(ts.WallTime)
ms.ContainsEstimates += cur.ContainsEstimates
ms.LiveCount -= cur.LiveCount
ms.LiveBytes -= cur.LiveBytes
return ms
}
// updateStatsOnRangeKeyMerge updates MVCCStats for a merge of two MVCC range
// key stacks. Both sides of the merge must have identical versions. The merge
// can happen either to the right or the left, only the merge key (i.e. the key
// where the stacks abut) is needed. versions can't be empty.
func updateStatsOnRangeKeyMerge(
mergeKey roachpb.Key, versions MVCCRangeKeyVersions,
) enginepb.MVCCStats {
// A merge is simply the inverse of a split.
var ms enginepb.MVCCStats
ms.Subtract(UpdateStatsOnRangeKeySplit(mergeKey, versions))
return ms
}
// UpdateStatsOnRangeKeySplit updates MVCCStats for the split/fragmentation of a
// range key stack at a given split key. versions can't be empty.
func UpdateStatsOnRangeKeySplit(
splitKey roachpb.Key, versions MVCCRangeKeyVersions,
) enginepb.MVCCStats {
var ms enginepb.MVCCStats
// Account for the creation of one of the range key stacks, and the key
// contribution of the end and start keys of the split stacks.
ms.AgeTo(versions[0].Timestamp.WallTime)
ms.RangeKeyCount++
ms.RangeKeyBytes += 2 * int64(EncodedMVCCKeyPrefixLength(splitKey))
// Account for the creation of all versions in the new stack.
for _, v := range versions {
ms.AgeTo(v.Timestamp.WallTime)
ms.RangeValCount++
ms.RangeKeyBytes += int64(EncodedMVCCTimestampSuffixLength(v.Timestamp))
ms.RangeValBytes += int64(len(v.Value))
}
return ms
}
// updateStatsOnClear updates stat counters by subtracting a
// cleared value's key and value byte sizes. If an earlier version
// was restored, the restored values are added to live bytes and
// count if the restored value isn't a deletion tombstone.
func updateStatsOnClear(
key roachpb.Key,
origMetaKeySize, origMetaValSize, restoredMetaKeySize, restoredMetaValSize int64,
orig, restored *enginepb.MVCCMetadata,
restoredNanos int64,
) enginepb.MVCCStats {
var ms enginepb.MVCCStats
if isSysLocal(key) {
if restored != nil {
ms.SysBytes += restoredMetaKeySize + restoredMetaValSize
ms.SysCount++
}
ms.SysBytes -= (orig.KeyBytes + orig.ValBytes) + (origMetaKeySize + origMetaValSize)
ms.SysCount--
return ms
}
// If we're restoring a previous value (which is thus not an intent), there are
// two main cases:
//
// 1. the previous value is a tombstone, so according to rule 2 it accrues
// GCBytesAge from its own timestamp on (we need to adjust only for the
// implicit meta key that "pops up" at that timestamp), -- or --
// 2. it is not, and it has been shadowed by the key we are clearing,
// in which case we need to offset its GCBytesAge contribution from
// restoredNanos to orig.Timestamp (rule 1).
if restored != nil {
if restored.Txn != nil {
panic("restored version should never be an intent")
}
ms.AgeTo(restoredNanos)
if restored.Deleted {
// The new meta key will be implicit and at restoredNanos. It needs to
// catch up on the GCBytesAge from that point on until orig.Timestamp
// (rule 2).
ms.KeyBytes += restoredMetaKeySize
ms.ValBytes += restoredMetaValSize
}
ms.AgeTo(orig.Timestamp.WallTime)
ms.KeyCount++
if !restored.Deleted {
// At orig.Timestamp, make the non-deletion version live again.
// Note that there's no need to explicitly age to the "present time"
// after.
ms.KeyBytes += restoredMetaKeySize
ms.ValBytes += restoredMetaValSize
ms.LiveBytes += restored.KeyBytes + restored.ValBytes
ms.LiveCount++
ms.LiveBytes += restoredMetaKeySize + restoredMetaValSize
}
} else {
ms.AgeTo(orig.Timestamp.WallTime)
}
if !orig.Deleted {
ms.LiveBytes -= (orig.KeyBytes + orig.ValBytes) + (origMetaKeySize + origMetaValSize)
ms.LiveCount--
}
ms.KeyBytes -= (orig.KeyBytes + origMetaKeySize)
ms.ValBytes -= (orig.ValBytes + origMetaValSize)
ms.KeyCount--
ms.ValCount--
if orig.Txn != nil {
ms.IntentBytes -= (orig.KeyBytes + orig.ValBytes)
ms.IntentCount--
ms.LockCount--
}
return ms
}
// updateStatsOnGC updates stat counters after garbage collection
// by subtracting key and value byte counts, updating key and
// value counts, and updating the GC'able bytes age. If metaKey is
// true, then the value being GC'd is the mvcc metadata and we
// decrement the key count.
//
// nonLiveMS is the timestamp at which the value became non-live.
// For a deletion tombstone this will be its own timestamp (rule two
// in updateStatsOnPut) and for a regular version it will be the closest
// newer version's (rule one).
func updateStatsOnGC(
key roachpb.Key, keySize, valSize int64, metaKey bool, nonLiveMS int64,
) enginepb.MVCCStats {
var ms enginepb.MVCCStats
if isSysLocal(key) {
ms.SysBytes -= keySize + valSize
if metaKey {
ms.SysCount--
}
return ms
}
ms.AgeTo(nonLiveMS)
ms.KeyBytes -= keySize
ms.ValBytes -= valSize
if metaKey {
ms.KeyCount--
} else {
ms.ValCount--
}
return ms
}
// MVCCGetProto fetches the value at the specified key and unmarshals it into
// msg if msg is non-nil. Returns true on success or false if the key was not
// found.
//
// See the documentation for MVCCGet for the semantics of the MVCCGetOptions.
func MVCCGetProto(
ctx context.Context,
reader Reader,
key roachpb.Key,
timestamp hlc.Timestamp,
msg protoutil.Message,
opts MVCCGetOptions,
) (bool, error) {
// TODO(tschottdorf): Consider returning skipped intents to the caller.
valueRes, mvccGetErr := MVCCGet(ctx, reader, key, timestamp, opts)
found := valueRes.Value != nil
// If we found a result, parse it regardless of the error returned by MVCCGet.
if found && msg != nil {
// If the unmarshal failed, return its result. Otherwise, pass
// through the underlying error (which may be a LockConflictError
// to be handled specially alongside the returned value).
if err := valueRes.Value.GetProto(msg); err != nil {
return found, err
}
}
return found, mvccGetErr
}
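// exampleGetRangeDescriptor is an illustrative sketch (a hypothetical helper)
// of the typical MVCCGetProto call pattern: pass a pointer to the proto to
// populate and check both the found flag and the error, since a value may be
// returned alongside an error in some read modes.
func exampleGetRangeDescriptor(
	ctx context.Context, reader Reader, key roachpb.Key, ts hlc.Timestamp,
) (roachpb.RangeDescriptor, bool, error) {
	var desc roachpb.RangeDescriptor
	found, err := MVCCGetProto(ctx, reader, key, ts, &desc, MVCCGetOptions{})
	return desc, found, err
}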
// MVCCPutProto sets the given key to the protobuf-serialized byte
// string of msg and the provided timestamp.
func MVCCPutProto(
ctx context.Context,
rw ReadWriter,
key roachpb.Key,
timestamp hlc.Timestamp,
msg protoutil.Message,
opts MVCCWriteOptions,
) error {
value := roachpb.Value{}
if err := value.SetProto(msg); err != nil {
return err
}
value.InitChecksum(key)
_, err := MVCCPut(ctx, rw, key, timestamp, value, opts)
return err
}
// MVCCBlindPutProto sets the given key to the protobuf-serialized byte string
// of msg and the provided timestamp. See MVCCBlindPut for a discussion on this
// fast-path and when it is appropriate to use.
func MVCCBlindPutProto(
ctx context.Context,
writer Writer,
key roachpb.Key,
timestamp hlc.Timestamp,
msg protoutil.Message,
opts MVCCWriteOptions,
) error {
value := roachpb.Value{}
if err := value.SetProto(msg); err != nil {
return err
}
value.InitChecksum(key)
_, err := MVCCBlindPut(ctx, writer, key, timestamp, value, opts)
return err
}
// MVCCBlindPutInlineWithPrev updates an inline value using a blind put when the
// previous value is known. The previous value is used to update MVCC stats.
func MVCCBlindPutInlineWithPrev(
ctx context.Context,
rw ReadWriter,
ms *enginepb.MVCCStats,
key roachpb.Key,
value, prev roachpb.Value,
) error {
// MVCCBlindPut() will update stats for the new key as if there was no
// existing key. Adjust stats for the removal of the previous value, if any.
var origMetaKeySize, origMetaValSize int64
if prev.IsPresent() && ms != nil {
origMetaKeySize = int64(MVCCKey{Key: key}.EncodedSize())
origMetaValSize = int64((&enginepb.MVCCMetadata{RawBytes: prev.RawBytes}).Size())
updateStatsForInline(ms, key, origMetaKeySize, origMetaValSize, 0, 0)
}
// Assert correct stats. Must be enabled manually, because the primary caller
// is lease requests, and these can race with concurrent lease requests since
// they don't hold latches. That's ok, because the lease request will be
// rejected below Raft in that case, but it would trip this assertion. We have
// plenty of other tests and assertions for this.
if false && ms != nil {
iter, err := newMVCCIterator(
ctx, rw, hlc.Timestamp{}, false /* rangeKeyMasking */, false, /* noInterleavedIntents */
IterOptions{
KeyTypes: IterKeyTypePointsAndRanges,
Prefix: true,
// Don't bother with ReadCategory.
},
)
if err != nil {
return err
}
defer iter.Close()
var meta enginepb.MVCCMetadata
ok, metaKeySize, metaValSize, _, err := mvccGetMetadata(iter, MVCCKey{Key: key}, &meta)
if err != nil {
return err
}
if ok != prev.IsPresent() || metaKeySize != origMetaKeySize || metaValSize != origMetaValSize {
log.Fatalf(ctx,
"MVCCBlindPutInlineWithPrev IsPresent=%t (%t) origMetaKeySize=%d (%d) origMetaValSize=%d (%d)",
prev.IsPresent(), ok, origMetaKeySize, metaKeySize, origMetaValSize, metaValSize)
}
}
// TODO(jackson): Thread origMetaValSize through so that a resulting
// ClearUnversioned sets ClearOptions.ValueSize[Known].
acq, err := MVCCBlindPut(ctx, rw, key, hlc.Timestamp{}, value, MVCCWriteOptions{Stats: ms})
if !acq.Empty() {
log.Fatal(ctx, "inline write should not be within a transaction; lock acquisition found")
}
return err
}
// LockTableView is a request-bound snapshot into an in-memory collection of
// key-level locks. The set of per-key locks stored in the in-memory lock table
// structure overlaps with those stored in the persistent lock table keyspace
// (i.e. intents produced by an MVCCKeyAndIntentsIterKind iterator), but one is
// not a subset of the other. There are locks only stored in the in-memory lock
// table (i.e. unreplicated locks) and locks only stored in the persistent lock
// table keyspace (i.e. replicated locks that have yet to be "discovered").
type LockTableView interface {
// IsKeyLockedByConflictingTxn returns whether the specified key is locked by
// a conflicting transaction in the request's snapshot of the lock table,
// given the request's own desired locking strength. If so, true is returned
// and so is the lock holder. Otherwise, false is returned.
//
// This method is used by requests in conjunction with the SkipLocked wait
// policy to determine which keys they should skip over during evaluation.
IsKeyLockedByConflictingTxn(context.Context, roachpb.Key) (bool, *enginepb.TxnMeta, error)
// Close cleans up the LockTableView; it should not be used after being
// closed.
Close()
}
// MVCCGetOptions bundles options for the MVCCGet family of functions.
type MVCCGetOptions struct {
// See the documentation for MVCCGet for information on these parameters.
Inconsistent bool
SkipLocked bool
Tombstones bool
FailOnMoreRecent bool
Txn *roachpb.Transaction
ScanStats *kvpb.ScanStats
Uncertainty uncertainty.Interval
// MemoryAccount is used for tracking memory allocations.
MemoryAccount *mon.BoundAccount
// LockTable is used to determine whether keys are locked in the in-memory
// lock table when scanning with the SkipLocked option.
LockTable LockTableView
// DontInterleaveIntents, when set, makes it such that intent metadata is not
// interleaved with the results of the scan. Setting this option means that
// the underlying pebble iterator will only scan over the MVCC keyspace and
// will not use an `intentInterleavingIter`. It is only appropriate to use
// this when the caller does not need to know whether a given key is an intent
// or not. It is usually set by read-only requests that have resolved their
// conflicts before they begin their MVCC scan.
DontInterleaveIntents bool
// MaxKeys is the maximum number of kv pairs returned from this operation.
// Any non-negative value results in an unbounded Get (a Get returns at most
// one key anyway). The value -1 returns no keys in the result, and a
// ResumeSpan equal to the request span is returned.
MaxKeys int64
// TargetBytes is a byte threshold to limit the amount of data pulled into
// memory during a Get operation. The zero value indicates no limit. The
// value -1 returns no keys in the result. A positive value represents an
// unbounded Get unless AllowEmpty is set. If an empty result is returned,
// then a ResumeSpan equal to the request span is returned.
TargetBytes int64
// AllowEmpty will return an empty result if the request key exceeds the
// TargetBytes limit.
AllowEmpty bool
// ReadCategory is used to map to a user-understandable category string, for
// stats aggregation and metrics, and a Pebble-understandable QoS.
ReadCategory fs.ReadCategory
// ReturnRawMVCCValues indicates the get should return a
// roachpb.Value whose RawBytes may contain MVCCValueHeader
// data.
ReturnRawMVCCValues bool
}
// MVCCGetResult bundles return values for the MVCCGet family of functions.
type MVCCGetResult struct {
// The most recent value for the specified key whose timestamp is less than
// or equal to the supplied timestamp. If no such value exists, nil is
// returned instead.
Value *roachpb.Value
// In inconsistent mode, the intent if an intent is encountered. In
// consistent mode, an intent will generate a LockConflictError with the
// intent embedded within and the intent parameter will be nil.
Intent *roachpb.Intent
// See the documentation for kvpb.ResponseHeader for information on
// these parameters.
ResumeSpan *roachpb.Span
ResumeReason kvpb.ResumeReason
ResumeNextBytes int64
NumKeys int64
NumBytes int64
}
func (opts *MVCCGetOptions) validate() error {
if opts.Inconsistent && opts.Txn != nil {
return errors.Errorf("cannot allow inconsistent reads within a transaction")
}
if opts.Inconsistent && opts.SkipLocked {
return errors.Errorf("cannot allow inconsistent reads with skip locked option")
}
if opts.Inconsistent && opts.FailOnMoreRecent {
return errors.Errorf("cannot allow inconsistent reads with fail on more recent option")
}
if opts.DontInterleaveIntents && opts.SkipLocked {
return errors.Errorf("cannot disable interleaved intents with skip locked option")
}
return nil
}
func (opts *MVCCGetOptions) errOnIntents() bool {
return !opts.Inconsistent && !opts.SkipLocked
}
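// exampleGetOptionValidation is an illustrative sketch of the combinations
// validate rejects: inconsistent reads cannot be transactional, cannot skip
// locked keys, and cannot fail on more recent writes, and intent interleaving
// cannot be disabled together with SkipLocked.
func exampleGetOptionValidation(txn *roachpb.Transaction) {
	ok := MVCCGetOptions{Inconsistent: true}
	_ = ok.validate() // nil

	bad := MVCCGetOptions{Inconsistent: true, Txn: txn}
	_ = bad.validate() // non-nil: inconsistent reads are not allowed in a transaction
}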
// MVCCResolveWriteIntentOptions bundles options for the MVCCResolveWriteIntent
// function.
type MVCCResolveWriteIntentOptions struct {
// See the documentation for MVCCResolveWriteIntent for information on these
// parameters.
TargetBytes int64
}
// MVCCResolveWriteIntentRangeOptions bundles options for the
// MVCCResolveWriteIntentRange function.
type MVCCResolveWriteIntentRangeOptions struct {
// See the documentation for MVCCResolveWriteIntentRange for information on
// these parameters.
MaxKeys int64
TargetBytes int64
}
// newMVCCIterator sets up a suitable iterator for high-level MVCC operations
// operating at the given timestamp. If timestamp is empty, the iterator is
// considered to be used for inline values and will not interleave intents
// (range keys may still be enabled by the caller); if `noInterleavedIntents`
// is set, intent interleaving is likewise disabled. If rangeKeyMasking is
// true, IterOptions.RangeKeyMaskingBelow is set to the given timestamp.
func newMVCCIterator(
ctx context.Context,
reader Reader,
timestamp hlc.Timestamp,
rangeKeyMasking bool,
noInterleavedIntents bool,
opts IterOptions,
) (MVCCIterator, error) {
// If reading inline then just return a plain MVCCIterator without intents.
// However, we allow the caller to enable range keys, since they may be needed
// for conflict checks when writing inline values.
if timestamp.IsEmpty() {
return reader.NewMVCCIterator(ctx, MVCCKeyIterKind, opts)
}
// Enable range key masking if requested.
if rangeKeyMasking && opts.KeyTypes != IterKeyTypePointsOnly &&
opts.RangeKeyMaskingBelow.IsEmpty() {
opts.RangeKeyMaskingBelow = timestamp
}
iterKind := MVCCKeyAndIntentsIterKind
if noInterleavedIntents {
iterKind = MVCCKeyIterKind
}
return reader.NewMVCCIterator(ctx, iterKind, opts)
}
// MVCCGet returns a MVCCGetResult.
//
// The first field of MVCCGetResult contains the most recent value for the
// specified key whose timestamp is less than or equal to the supplied
// timestamp. If no such value exists, nil is returned instead.
//
// In tombstones mode, if the most recent value is a deletion tombstone, the
// result will be a non-nil roachpb.Value whose RawBytes field is nil.
// Otherwise, a deletion tombstone results in a nil roachpb.Value. MVCC range
// tombstones are emitted as synthetic point tombstones, even if there is no
// existing point key at the position.
//
// NB: Synthetic tombstones generated for MVCC range tombstones may not be
// visible to an MVCCScan, since MVCCScan will only synthesize point tombstones
// above existing point keys.
//
// In inconsistent mode, if an intent is encountered, it will be placed in the
// intent field. By contrast, in consistent mode, an intent will generate a
// LockConflictError with the intent embedded within, and the intent result
// parameter will be nil.
//
// Note that transactional gets must be consistent. Put another way, only
// non-transactional gets may be inconsistent.
//
// If the timestamp is specified as hlc.Timestamp{}, the value is expected to be
// "inlined". See MVCCPut().
//
// When reading in "skip locked" mode, a key that is locked by a transaction
// other than the reader is not included in the result set and does not result
// in a LockConflictError. Instead, the key is included in the encountered intent
// result parameter so that it can be resolved asynchronously. In this mode, the
// LockTableView provided in the options is consulted for any observed key to
// determine whether it is locked with an unreplicated lock.
//
// When reading in "fail on more recent" mode, a WriteTooOldError will be
// returned if the read observes a version with a timestamp above the read
// timestamp. Similarly, a LockConflictError will be returned if the read
// observes another transaction's intent, even if it has a timestamp above
// the read timestamp.
func MVCCGet(
ctx context.Context, reader Reader, key roachpb.Key, timestamp hlc.Timestamp, opts MVCCGetOptions,
) (MVCCGetResult, error) {
res, _, err := MVCCGetWithValueHeader(ctx, reader, key, timestamp, opts)
return res, err
}
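// exampleTombstonesGet is an illustrative sketch (a hypothetical helper) of
// reading in tombstones mode: a deletion tombstone comes back as a non-nil
// roachpb.Value with nil RawBytes, letting the caller distinguish "deleted at
// or below this timestamp" from "never existed".
func exampleTombstonesGet(
	ctx context.Context, reader Reader, key roachpb.Key, ts hlc.Timestamp,
) (existsButDeleted bool, err error) {
	res, err := MVCCGet(ctx, reader, key, ts, MVCCGetOptions{Tombstones: true})
	if err != nil {
		return false, err
	}
	return res.Value != nil && !res.Value.IsPresent(), nil
}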
// MVCCGetForKnownTimestampWithNoIntent returns the value for key@timestamp,
// which is required to exist as a point (not a rangekey) and not have a
// corresponding intent. It returns a tombstone if that is the value at
// key@timestamp. It returns an error if there is no value. The caller should
// set valueInBatch to true only if the value is known to be in the batch, so
// that it does not need to be read from the engine (which would be required
// if the batch does not have it).
//
// REQUIRES: batch is an indexed batch.
//
// TODO(sumeer): microbenchmark for valueInBatch={false,true}. There are some
// macro-benchmark numbers using kv0 and changefeeds in
// https://github.com/cockroachdb/cockroach/issues/113090#issuecomment-1782902045.
func MVCCGetForKnownTimestampWithNoIntent(
ctx context.Context, batch Batch, key roachpb.Key, timestamp hlc.Timestamp, valueInBatch bool,
) (*roachpb.Value, enginepb.MVCCValueHeader, error) {
var iter MVCCIterator
var err error
if valueInBatch {
iter, err = batch.NewBatchOnlyMVCCIterator(ctx,
IterOptions{KeyTypes: IterKeyTypePointsAndRanges, Prefix: true})
} else {
// TODO(sumeer): Use Pebble's Get. The value has likely been written
// recently, so may be in the memtable or L0. A Pebble Get will
// iteratively go down the levels and find the value in a higher level,
// which would avoid seeking all the levels. This will not need to handle
// rangekeys. We won't be able to use mvccGet, that we are using below for
// convenience. We should also measure that the Get is performant enough
// to avoid the need to use a batch-only iterator for the valueInBatch
// case (though using the batch-only iterator allows us to assert that the
// value was indeed found in the batch).
iter, err = batch.NewMVCCIterator(ctx, MVCCKeyIterKind,
IterOptions{
KeyTypes: IterKeyTypePointsAndRanges, Prefix: true, ReadCategory: fs.RangefeedReadCategory})
}
if err != nil {
return nil, enginepb.MVCCValueHeader{}, err
}
defer iter.Close()
// Use mvccGet, even though we know the exact timestamp, since it is
// convenient.
//
// mvccGet will expose a rangekey tombstone for key@timestamp, as a
// point, even though we know key@timestamp must be a point-key. We
// should stop using mvccGet, which would allow us to assert on this
// expected behavior.
value, intent, err := mvccGet(
ctx, iter, key, timestamp, MVCCGetOptions{Tombstones: true})
val := value.ToPointer()
if intent != nil {
// This is an assertion failure since we constructed the iterators above
// with MVCCKeyIterKind, so they should not see intents.
return nil, enginepb.MVCCValueHeader{}, errors.AssertionFailedf(
"unexpected intent %v for key %v", intent, key)
}
if val == nil {
return nil, enginepb.MVCCValueHeader{}, errors.Errorf("value missing for key %v", key)
}
if val.Timestamp != timestamp {
return nil, enginepb.MVCCValueHeader{}, errors.Errorf(
"expected timestamp %v and found %v for key %v", timestamp, val.Timestamp, key)
}
return val, value.MVCCValueHeader, err
}
// MVCCGetWithValueHeader is like MVCCGet, but in addition returns the
// MVCCValueHeader for the value.
func MVCCGetWithValueHeader(
ctx context.Context, reader Reader, key roachpb.Key, timestamp hlc.Timestamp, opts MVCCGetOptions,
) (MVCCGetResult, enginepb.MVCCValueHeader, error) {
var result MVCCGetResult
if opts.MaxKeys < 0 || opts.TargetBytes < 0 {
// Receipt of a GetRequest with negative MaxKeys or TargetBytes indicates
// that the request was part of a batch that has already exhausted its
// limit, which means that we should *not* serve the request and return a
// ResumeSpan for this GetRequest.
result.ResumeSpan = &roachpb.Span{Key: key}
if opts.MaxKeys < 0 {
result.ResumeReason = kvpb.RESUME_KEY_LIMIT
} else if opts.TargetBytes < 0 {
result.ResumeReason = kvpb.RESUME_BYTE_LIMIT
}
return result, enginepb.MVCCValueHeader{}, nil
}
iter, err := newMVCCIterator(
ctx, reader, timestamp, false /* rangeKeyMasking */, opts.DontInterleaveIntents,
IterOptions{
KeyTypes: IterKeyTypePointsAndRanges,
Prefix: true,
ReadCategory: opts.ReadCategory,
},
)
if err != nil {
return result, enginepb.MVCCValueHeader{}, err
}
defer iter.Close()
value, intent, err := mvccGet(ctx, iter, key, timestamp, opts)
val := value.ToPointer()
if err == nil && val != nil {
// NB: This calculation is different from Scan, since Scan responses include
// the key/value pair while Get only includes the value.
numBytes := int64(len(val.RawBytes))
if opts.TargetBytes > 0 && opts.AllowEmpty && numBytes > opts.TargetBytes {
result.ResumeSpan = &roachpb.Span{Key: key}
result.ResumeReason = kvpb.RESUME_BYTE_LIMIT
result.ResumeNextBytes = numBytes
return result, enginepb.MVCCValueHeader{}, nil
}
result.NumKeys = 1
result.NumBytes = numBytes
}
result.Value = val
result.Intent = intent
return result, value.MVCCValueHeader, err
}
// mvccGet returns an optionalValue containing the MVCCValue for the
// given key (if it exists).
//
// The MVCCValueHeader is included in the returned MVCCValue.
func mvccGet(
ctx context.Context,
iter MVCCIterator,
key roachpb.Key,
timestamp hlc.Timestamp,
opts MVCCGetOptions,
) (value optionalValue, intent *roachpb.Intent, err error) {
if len(key) == 0 {
return optionalValue{}, nil, emptyKeyError()
}
if timestamp.WallTime < 0 {
return optionalValue{}, nil, errors.Errorf("cannot write to %q at timestamp %s", key, timestamp)
}
if util.RaceEnabled && !iter.IsPrefix() {
return optionalValue{}, nil, errors.AssertionFailedf("mvccGet called with non-prefix iterator")
}
if err := opts.validate(); err != nil {
return optionalValue{}, nil, err
}
mvccScanner := pebbleMVCCScannerPool.Get().(*pebbleMVCCScanner)
defer mvccScanner.release()
memAccount := mvccScanner.memAccount
if opts.MemoryAccount != nil {
memAccount = opts.MemoryAccount
}
// MVCCGet is implemented as an MVCCScan where we retrieve a single key. We
// specify an empty key for the end key, which ensures we don't retrieve a
// key different from the start key. This is a bit of a hack.
*mvccScanner = pebbleMVCCScanner{
parent: iter,
memAccount: memAccount,
unlimitedMemAcc: mvccScanner.unlimitedMemAcc,
lockTable: opts.LockTable,
start: key,
ts: timestamp,
maxKeys: 1,
inconsistent: opts.Inconsistent,
skipLocked: opts.SkipLocked,
tombstones: opts.Tombstones,
rawMVCCValues: opts.ReturnRawMVCCValues,
failOnMoreRecent: opts.FailOnMoreRecent,
keyBuf: mvccScanner.keyBuf,
decodeMVCCHeaders: true,
}
results := &mvccScanner.alloc.pebbleResults
*results = pebbleResults{}
mvccScanner.init(opts.Txn, opts.Uncertainty, results)
mvccScanner.get(ctx)
// If we're tracking the ScanStats, include the stats from this Get.
if opts.ScanStats != nil {
recordIteratorStats(mvccScanner.parent, opts.ScanStats)
opts.ScanStats.NumGets++
}
if mvccScanner.err != nil {
return optionalValue{}, nil, mvccScanner.err
}
intents, err := buildScanIntents(mvccScanner.intentsRepr())
if err != nil {
return optionalValue{}, nil, err
}
if opts.errOnIntents() && len(intents) > 0 {
lcErr := &kvpb.LockConflictError{Locks: roachpb.AsLocks(intents)}
return optionalValue{}, nil, lcErr
}
if len(intents) > 1 {
return optionalValue{}, nil, errors.Errorf("expected 0 or 1 intents, got %d", len(intents))
} else if len(intents) == 1 {
intent = &intents[0]
}
if len(results.repr) == 0 {
return optionalValue{}, intent, nil
}
mvccKey, rawValue, _, err := MVCCScanDecodeKeyValue(results.repr)
if err != nil {
return optionalValue{}, nil, err
}
// NB: we may return MVCCValueHeader out of curUnsafeValue because that
// type does not contain any pointers. A comment on MVCCValueHeader ensures
// that this stays true.
value = makeOptionalValue(MVCCValue{Value: roachpb.Value{
RawBytes: rawValue,
Timestamp: mvccKey.Timestamp,
}, MVCCValueHeader: mvccScanner.curUnsafeValue.MVCCValueHeader})
return value, intent, nil
}
// MVCCGetAsTxn constructs a temporary transaction from the given transaction
// metadata and calls MVCCGet as that transaction. This method is required
// only for reading intents of a transaction when only its metadata is known
// and should rarely be used.
//
// The read is carried out without the chance of uncertainty restarts.
func MVCCGetAsTxn(
ctx context.Context,
reader Reader,
key roachpb.Key,
timestamp hlc.Timestamp,
txnMeta enginepb.TxnMeta,
) (MVCCGetResult, error) {
return MVCCGet(ctx, reader, key, timestamp, MVCCGetOptions{
Txn: &roachpb.Transaction{
TxnMeta: txnMeta,
Status: roachpb.PENDING,
ReadTimestamp: txnMeta.WriteTimestamp,
GlobalUncertaintyLimit: txnMeta.WriteTimestamp,
}})
}
// mvccGetMetadata returns or reconstructs the meta key for the given key.
// A prefix scan using the iterator is performed, resulting in one of the
// following successful outcomes:
//
// 1. iterator finds nothing; returns (false, 0, 0, nil).
// 2. iterator finds an explicit meta key; unmarshals and returns its size.
// ok is set to true.
// 3. iterator finds a value, i.e. the meta key is implicit.
// In this case, it accounts for the size of the key with the portion
// of the user key found which is not the MVCC timestamp suffix (since
// that is the usual contribution of the meta key). The value size returned
// will be zero, as there is no stored MVCCMetadata.
// ok is set to true.
// 4. iterator finds an MVCC range tombstone above a value. In this case,
// metadata for a synthetic point tombstone is returned.
//
// The timestamp where the real point key last changed is also returned, if a
// real point key was found. This may differ from the metadata timestamp when a
// point key is covered by multiple MVCC range tombstones (in which case the
// point key disappeared at the _lowest_ range tombstone above it), or when a
// point tombstone is covered by a range tombstone (in which case the point key
// disappeared at the point tombstone). It is needed to correctly account for
// the GCBytesAge contribution of the key prefix, which is not affected by MVCC
// range tombstones, and would be incorrect if we used the synthetic point
// tombstone of the newest MVCC range tombstone instead.
//
// The passed in MVCCMetadata must not be nil. If the supplied iterator is nil,
// no seek operation is performed. This is used by the Blind{Put,ConditionalPut}
// operations to avoid seeking when the metadata is known not to exist.
func mvccGetMetadata(
iter MVCCIterator, metaKey MVCCKey, meta *enginepb.MVCCMetadata,
) (ok bool, keyBytes, valBytes int64, realKeyChanged hlc.Timestamp, err error) {
if iter == nil {
return false, 0, 0, hlc.Timestamp{}, nil
}
iter.SeekGE(metaKey)
if ok, err = iter.Valid(); !ok {
return false, 0, 0, hlc.Timestamp{}, err
}
unsafeKey := iter.UnsafeKey()
if !unsafeKey.Key.Equal(metaKey.Key) {
return false, 0, 0, hlc.Timestamp{}, nil
}
hasPoint, hasRange := iter.HasPointAndRange()
// Check for existing intent metadata. Intents will be emitted colocated with
// a covering range key when seeking to it, and always located above range
// keys, so we don't need to check for range keys here.
if hasPoint && !unsafeKey.IsValue() {
if err := iter.ValueProto(meta); err != nil {
return false, 0, 0, hlc.Timestamp{}, err
}
return true, int64(unsafeKey.EncodedSize()), int64(iter.ValueLen()),
meta.Timestamp.ToTimestamp(), nil
}
// Synthesize point key metadata. For values, the size of keys is always
// accounted for as MVCCVersionTimestampSize. The size of the metadata key is
// accounted for separately.
meta.Reset()
meta.KeyBytes = MVCCVersionTimestampSize
// If we land on a (bare) range key, step to look for a colocated point key.
if hasRange && !hasPoint {
rkTimestamp := iter.RangeKeys().Versions[0].Timestamp
iter.Next()
if ok, err = iter.Valid(); err != nil {
return false, 0, 0, hlc.Timestamp{}, err
} else if ok {
// NB: For !ok, hasPoint is already false.
hasPoint, hasRange = iter.HasPointAndRange()
unsafeKey = iter.UnsafeKey()
}
// If only a bare range tombstone was found at the seek key, synthesize
// point tombstone metadata for it. realKeyChanged is empty since there
// was no real point key here.
if !hasPoint || !unsafeKey.Key.Equal(metaKey.Key) {
meta.Deleted = true
meta.Timestamp = rkTimestamp.ToLegacyTimestamp()
return true, 0, 0, hlc.Timestamp{}, nil
}
}
// We're now on a point key.
valueLen, isTombstone, err := iter.MVCCValueLenAndIsTombstone()
if err != nil {
return false, 0, 0, hlc.Timestamp{}, err
}
// Check if the point key is covered by an MVCC range tombstone, and
// synthesize point tombstone metadata for it in that case. realKeyChanged is
// set to the timestamp where the point key ceased to exist -- either the
// lowest range tombstone above the key (not the highest which is used for
// metadata), or the point version's timestamp if it was a tombstone.
if hasRange {
rangeKeys := iter.RangeKeys()
if v, ok := rangeKeys.FirstAtOrAbove(unsafeKey.Timestamp); ok {
meta.Deleted = true
meta.Timestamp = rangeKeys.Versions[0].Timestamp.ToLegacyTimestamp()
keyLastSeen := v.Timestamp
if isTombstone {
keyLastSeen = unsafeKey.Timestamp
}
return true, int64(EncodedMVCCKeyPrefixLength(metaKey.Key)), 0, keyLastSeen, nil
}
}
// Synthesize metadata for a regular point key.
meta.ValBytes = int64(valueLen)
meta.Deleted = isTombstone
meta.Timestamp = unsafeKey.Timestamp.ToLegacyTimestamp()
return true, int64(EncodedMVCCKeyPrefixLength(metaKey.Key)), 0, unsafeKey.Timestamp, nil
}
// putBuffer holds pointer data needed by mvccPutInternal. Bundling
// this data into a single structure reduces memory
// allocations. Managing this temporary buffer using a sync.Pool
// completely eliminates allocation from the put common path.
type putBuffer struct {
meta enginepb.MVCCMetadata
newMeta enginepb.MVCCMetadata
ts hlc.LegacyTimestamp // avoids heap allocations
ltKeyBuf []byte // avoids heap allocations
metaBuf []byte // avoids heap allocations
}
var putBufferPool = sync.Pool{
New: func() interface{} {
return &putBuffer{}
},
}
func newPutBuffer() *putBuffer {
return putBufferPool.Get().(*putBuffer)
}
func (b *putBuffer) release() {
*b = putBuffer{ltKeyBuf: b.ltKeyBuf[:0], metaBuf: b.metaBuf[:0]}
putBufferPool.Put(b)
}
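// examplePutBufferUse is an illustrative sketch of the acquire/release pattern
// used by the put paths below (e.g. MVCCDelete and mvccPutUsingIter): take a
// buffer from the pool, defer its release, and pass it to mvccPutInternal so
// that the buffer's scratch fields are reused instead of re-allocated.
func examplePutBufferUse() {
	buf := newPutBuffer()
	defer buf.release()
	_ = buf // real callers pass buf to mvccPutInternal
}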
func (b *putBuffer) lockTableKey(
key roachpb.Key, str lock.Strength, txnID uuid.UUID,
) (ltEngKey EngineKey, keyBytes int64) {
ltKey := LockTableKey{
Key: key,
Strength: str,
TxnUUID: txnID,
}
ltEngKey, b.ltKeyBuf = ltKey.ToEngineKey(b.ltKeyBuf)
keyBytes = ltKey.EncodedSize()
return ltEngKey, keyBytes
}
func (b *putBuffer) marshalMeta(meta *enginepb.MVCCMetadata) (_ []byte, err error) {
size := meta.Size()
data := b.metaBuf
if cap(data) < size {
data = make([]byte, size)
} else {
data = data[:size]
}
n, err := protoutil.MarshalToSizedBuffer(meta, data)
if err != nil {
return nil, err
}
b.metaBuf = data
return data[:n], nil
}
func (b *putBuffer) putInlineMeta(
writer Writer, key MVCCKey, meta *enginepb.MVCCMetadata,
) (keyBytes, valBytes int64, err error) {
bytes, err := b.marshalMeta(meta)
if err != nil {
return 0, 0, err
}
if err := writer.PutUnversioned(key.Key, bytes); err != nil {
return 0, 0, err
}
return int64(key.EncodedSize()), int64(len(bytes)), nil
}
var trueValue = true
// putLockMeta puts a lock at the given key with the provided strength and
// value.
func (b *putBuffer) putLockMeta(
writer Writer,
key roachpb.Key,
str lock.Strength,
meta *enginepb.MVCCMetadata,
alreadyExists bool,
) (keyBytes, valBytes int64, err error) {
if meta.Timestamp.ToTimestamp() != meta.Txn.WriteTimestamp {
// The timestamps are supposed to be in sync. If they weren't, it wouldn't
// be clear for readers which one to use for what.
return 0, 0, errors.AssertionFailedf(
"meta.Timestamp != meta.Txn.WriteTimestamp: %s != %s", meta.Timestamp, meta.Txn.WriteTimestamp)
}
lockTableKey, lockTableKeyBytes := b.lockTableKey(key, str, meta.Txn.ID)
if alreadyExists {
// Absence represents false.
meta.TxnDidNotUpdateMeta = nil
} else {
meta.TxnDidNotUpdateMeta = &trueValue
}
bytes, err := b.marshalMeta(meta)
if err != nil {
return 0, 0, err
}
if err = writer.PutEngineKey(lockTableKey, bytes); err != nil {
return 0, 0, err
}
if str == lock.Intent {
// For historical reasons, intent metadata key-values use the encoded
// size of the unversioned MVCCKey that they are virtualized at (e.g. by
// the intentInterleavingIter) as their contribution to stats, instead
// of their real size in the lock table keyspace.
keyBytes = int64(MakeMVCCMetadataKey(key).EncodedSize())
} else {
keyBytes = lockTableKeyBytes
}
valBytes = int64(len(bytes))
return keyBytes, valBytes, nil
}
// clearLockMeta clears a lock at the given key and strength.
//
// txnDidNotUpdateMeta allows for performance optimization when set to true, and
// has semantics defined in MVCCMetadata.TxnDidNotUpdateMeta (it can be
// conservatively set to false).
//
// TODO(sumeer): after the full transition to separated locks, measure the cost
// of a putLockMeta implementation, where there is an existing intent, that does
// a <single-clear, put> pair. If there isn't a performance decrease, we can
// stop tracking txnDidNotUpdateMeta and still optimize clearLockMeta by always
// doing single-clear.
func (b *putBuffer) clearLockMeta(
writer Writer,
key roachpb.Key,
str lock.Strength,
txnDidNotUpdateMeta bool,
txnUUID uuid.UUID,
opts ClearOptions,
) (keyBytes, valBytes int64, err error) {
lockTableKey, lockTableKeyBytes := b.lockTableKey(key, str, txnUUID)
if txnDidNotUpdateMeta {
err = writer.SingleClearEngineKey(lockTableKey)
} else {
err = writer.ClearEngineKey(lockTableKey, opts)
}
if str == lock.Intent {
// See comment in putLockMeta.
keyBytes = int64(MakeMVCCMetadataKey(key).EncodedSize())
} else {
keyBytes = lockTableKeyBytes
}
valBytes = 0 // cleared
return keyBytes, valBytes, err
}
// MVCCPut sets the value for a specified key. It will save the value
// with different versions according to its timestamp and update the
// key metadata. The timestamp must be passed as a parameter; using
// the Timestamp field on the value results in an error.
//
// Note that, when writing transactionally, the txn's timestamps
// dictate the timestamp of the operation, and the timestamp parameter is
// confusing and redundant. See the comment on mvccPutInternal for details.
//
// If the timestamp is specified as hlc.Timestamp{}, the value is
// inlined instead of being written as a timestamp-versioned value. A
// zero timestamp write to a key precludes a subsequent write using a
// non-zero timestamp and vice versa. Inlined values require only a
// single row and never accumulate more than a single value. Successive
// zero timestamp writes to a key replace the value and deletes clear
// the value. In addition, zero timestamp values may be merged.
func MVCCPut(
ctx context.Context,
rw ReadWriter,
key roachpb.Key,
timestamp hlc.Timestamp,
value roachpb.Value,
opts MVCCWriteOptions,
) (roachpb.LockAcquisition, error) {
// If we're not tracking stats for the key and we're writing a non-versioned
// key we can utilize a blind put to avoid reading any existing value.
var iter MVCCIterator
var ltScanner *lockTableKeyScanner
blind := opts.Stats == nil && timestamp.IsEmpty()
if !blind {
var err error
iter, err = newMVCCIterator(
ctx, rw, timestamp, false /* rangeKeyMasking */, true, /* noInterleavedIntents */
IterOptions{
KeyTypes: IterKeyTypePointsAndRanges,
Prefix: true,
ReadCategory: opts.Category,
},
)
if err != nil {
return roachpb.LockAcquisition{}, err
}
defer iter.Close()
inlinePut := timestamp.IsEmpty()
if !inlinePut {
ltScanner, err = newLockTableKeyScanner(
ctx, rw, opts.TxnID(), lock.Intent, opts.MaxLockConflicts, opts.TargetLockConflictBytes, opts.Category)
if err != nil {
return roachpb.LockAcquisition{}, err
}
defer ltScanner.close()
}
}
return mvccPutUsingIter(ctx, rw, iter, ltScanner, key, timestamp, value, nil, opts)
}
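// examplePutAtTimestamp is an illustrative sketch (a hypothetical helper) of a
// non-transactional MVCCPut: the timestamp is passed as a parameter, never via
// value.Timestamp, and callers that track stats pass them through
// MVCCWriteOptions. The string contents of the value are arbitrary.
func examplePutAtTimestamp(
	ctx context.Context, rw ReadWriter, ms *enginepb.MVCCStats, key roachpb.Key, ts hlc.Timestamp,
) error {
	value := roachpb.MakeValueFromString("v")
	value.InitChecksum(key)
	_, err := MVCCPut(ctx, rw, key, ts, value, MVCCWriteOptions{Stats: ms})
	return err
}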
// MVCCBlindPut is a fast-path of MVCCPut. See the MVCCPut comments for details
// of the semantics. MVCCBlindPut skips retrieving the existing metadata for
// the key requiring the caller to guarantee no versions for the key currently
// exist in order for stats to be updated properly. If a previous version of
// the key does exist it is up to the caller to properly account for their
// existence in updating the stats.
//
// Note that, when writing transactionally, the txn's timestamps
// dictate the timestamp of the operation, and the timestamp parameter is
// confusing and redundant. See the comment on mvccPutInternal for details.
func MVCCBlindPut(
ctx context.Context,
writer Writer,
key roachpb.Key,
timestamp hlc.Timestamp,
value roachpb.Value,
opts MVCCWriteOptions,
) (roachpb.LockAcquisition, error) {
return mvccPutUsingIter(ctx, writer, nil, nil, key, timestamp, value, nil, opts)
}
// MVCCDelete marks the key deleted so that it will not be returned in
// future get responses.
//
// Note that, when writing transactionally, the txn's timestamps
// dictate the timestamp of the operation, and the timestamp parameter is
// confusing and redundant. See the comment on mvccPutInternal for details.
//
// foundKey indicates whether the key that was passed in had a value already in
// the database.
func MVCCDelete(
ctx context.Context,
rw ReadWriter,
key roachpb.Key,
timestamp hlc.Timestamp,
opts MVCCWriteOptions,
) (foundKey bool, _ roachpb.LockAcquisition, err error) {
iter, err := newMVCCIterator(
ctx, rw, timestamp, false /* rangeKeyMasking */, true, /* noInterleavedIntents */
IterOptions{
KeyTypes: IterKeyTypePointsAndRanges,
Prefix: true,
ReadCategory: opts.Category,
},
)
if err != nil {
return false, roachpb.LockAcquisition{}, err
}
defer iter.Close()
inlineDelete := timestamp.IsEmpty()
var ltScanner *lockTableKeyScanner
if !inlineDelete {
ltScanner, err = newLockTableKeyScanner(
ctx, rw, opts.TxnID(), lock.Intent, opts.MaxLockConflicts, opts.TargetLockConflictBytes, opts.Category)
if err != nil {
return false, roachpb.LockAcquisition{}, err
}
defer ltScanner.close()
}
buf := newPutBuffer()
defer buf.release()
// TODO(yuzefovich): can we avoid the put if the key does not exist?
return mvccPutInternal(
ctx, rw, iter, ltScanner, key, timestamp, noValue, buf, nil, opts)
}
var noValue = roachpb.Value{}
// mvccPutUsingIter sets the value for a specified key using the provided
// MVCCIterator. The function takes a value and a valueFn, only one of which
// should be provided. If valueFn is nil, value's raw bytes will be written
// for the key; otherwise, the bytes returned by valueFn will be used.
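//
// As an illustrative sketch (not from a real caller), a valueFn that
// increments an existing integer value, mirroring what MVCCIncrement does
// below, could look like the following (key is assumed to be in scope):
//
//	valueFn := func(exist optionalValue) (roachpb.Value, error) {
//		var cur int64
//		if exist.IsPresent() {
//			var err error
//			if cur, err = exist.Value.GetInt(); err != nil {
//				return roachpb.Value{}, err
//			}
//		}
//		var newValue roachpb.Value
//		newValue.SetInt(cur + 1)
//		newValue.InitChecksum(key)
//		return newValue, nil
//	}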
func mvccPutUsingIter(
ctx context.Context,
writer Writer,
iter MVCCIterator,
ltScanner *lockTableKeyScanner,
key roachpb.Key,
timestamp hlc.Timestamp,
value roachpb.Value,
valueFn func(optionalValue) (roachpb.Value, error),
opts MVCCWriteOptions,
) (roachpb.LockAcquisition, error) {
buf := newPutBuffer()
defer buf.release()
// Most callers don't care about the returned exReplaced value. The ones that
// do can call mvccPutInternal directly.
_, acq, err := mvccPutInternal(
ctx, writer, iter, ltScanner, key, timestamp, value, buf, valueFn, opts)
return acq, err
}
// MVCCScanDecodeKeyValue decodes a key/value pair returned in an MVCCScan
// "batch" (this is not the RocksDB batch repr format), returning both the
// key/value and the suffix of data remaining in the batch.
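//
// A sketch of consuming an entire batch (illustrative only; process is a
// placeholder for caller logic):
//
//	for len(repr) > 0 {
//		key, rawBytes, rest, err := MVCCScanDecodeKeyValue(repr)
//		if err != nil {
//			return err
//		}
//		process(key, rawBytes)
//		repr = rest
//	}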
func MVCCScanDecodeKeyValue(repr []byte) (key MVCCKey, value []byte, orepr []byte, err error) {
k, ts, value, orepr, err := enginepb.ScanDecodeKeyValue(repr)
return MVCCKey{k, ts}, value, orepr, err
}
// MVCCScanDecodeKeyValues decodes all key/value pairs returned in one or more
// MVCCScan "batches" (this is not the RocksDB batch repr format). The provided
// function is called for each key/value pair.
func MVCCScanDecodeKeyValues(repr [][]byte, fn func(key MVCCKey, rawBytes []byte) error) error {
return enginepb.ScanDecodeKeyValues(repr, func(k []byte, ts hlc.Timestamp, rawBytes []byte) error {
return fn(MVCCKey{k, ts}, rawBytes)
})
}
// replayTransactionalWrite performs a transactional write under the assumption
// that the transactional write was already executed before. Essentially a replay.
// Since transactions should be idempotent, we must be particularly careful
// about writing an intent if it was already written. The function is called
// when the sequence of the transaction is at or below one found in `meta.Txn`,
// so we assert the value we're trying to add against the value that was
// previously written at that sequence.
//
// 1) Firstly, we find the value previously written as part of the same sequence.
// 2) We then figure out the value the transaction is trying to write (either the
// value itself or the valueFn applied to the right previous value).
// 3) We assert that the transactional write is idempotent.
//
// This ensures that all intents are found and that the values are always
// accurate for transactional idempotency.
func replayTransactionalWrite(
ctx context.Context,
iter MVCCIterator,
meta *enginepb.MVCCMetadata,
key roachpb.Key,
value roachpb.Value,
txn *roachpb.Transaction,
valueFn func(optionalValue) (roachpb.Value, error),
replayWriteTimestampProtection bool,
) error {
var writtenValue optionalValue
var err error
if txn.Sequence == meta.Txn.Sequence {
// This is a special case. This is when the intent hasn't made it
// to the intent history yet. We must now assert that the value
// written in the intent matches the value we're trying to write.
writtenValue, _, err = mvccGet(ctx, iter, key, meta.Timestamp.ToTimestamp(), MVCCGetOptions{
// NOTE: we pass Txn here to ensure that this read succeeds even if
// iter is interleaving intents. This is not needed if iter is a raw
// MVCCIterator.
Txn: txn,
Tombstones: true,
})
if err != nil {
return err
}
} else {
// Get the value from the intent history.
if intentValRaw, ok := meta.GetIntentValue(txn.Sequence); ok {
intentVal, err := DecodeMVCCValue(intentValRaw)
if err != nil {
return err
}
writtenValue = makeOptionalValue(intentVal)
}
}
if !writtenValue.exists {
// NB: This error may be due to a batched `DelRange` operation that, upon being replayed, finds a new key to delete.
// See issue #71236 for more explanation.
err := errors.AssertionFailedf("transaction %s with sequence %d missing an intent with lower sequence %d",
txn.ID, meta.Txn.Sequence, txn.Sequence)
errWithIssue := errors.WithIssueLink(err,
errors.IssueLink{
IssueURL: build.MakeIssueURL(71236),
Detail: "This error may be caused by `DelRange` operation in a batch that also contains a " +
"write on an intersecting key, as in the case the other write hits a `WriteTooOld` " +
"error, it is possible for the `DelRange` operation to pick up a new key to delete on " +
"replay, which will cause sanity checks of intent history to fail as the first iteration " +
"of the operation would not have placed an intent on this new key.",
})
return errWithIssue
}
// If the valueFn is specified, we must apply it to the would-be value at the key.
if valueFn != nil {
var exVal optionalValue
// If there's an intent history, use that.
prevIntent, prevValueWritten := meta.GetPrevIntentSeq(txn.Sequence, txn.IgnoredSeqNums)
if prevValueWritten {
// If the previous value was found in the IntentHistory,
// simply apply the value function to the historic value
// to get the would-be value.
prevIntentVal, err := DecodeMVCCValue(prevIntent.Value)
if err != nil {
return err
}
exVal = makeOptionalValue(prevIntentVal)
} else {
// If the previous value at the key wasn't written by this
// transaction, or it was hidden by a rolled back seqnum, we look at
// the last committed value on the key. Since we want the last committed
// value on the key, we read below our previous intents here.
metaTimestamp := meta.Timestamp.ToTimestamp()
val, _, err := mvccGet(ctx, iter, key, metaTimestamp.Prev(), MVCCGetOptions{
Tombstones: true,
})
if err != nil {
return err
}
exVal = val
}
value, err = valueFn(exVal)
if err != nil {
return err
}
}
// To ensure the transaction is idempotent, we must assert that the
// calculated value on this replay is the same as the one we've previously
// written.
if !bytes.Equal(value.RawBytes, writtenValue.Value.RawBytes) {
return errors.AssertionFailedf("transaction %s with sequence %d has a different value %+v after recomputing from what was written: %+v",
txn.ID, txn.Sequence, value.RawBytes, writtenValue.Value.RawBytes)
}
// If ambiguous replay protection is enabled, a replay that changes the
// timestamp should fail, as this would break idempotency (see #103817).
if replayWriteTimestampProtection && !txn.WriteTimestamp.Equal(meta.Txn.WriteTimestamp) {
return errors.Errorf("transaction %s with sequence %d prevented from changing "+
"write timestamp from %s to %s due to ambiguous replay protection",
txn.ID, txn.Sequence, meta.Txn.WriteTimestamp, txn.WriteTimestamp)
}
return nil
}
// mvccPutInternal adds a new timestamped value to the specified key.
// If value is nil, a deletion tombstone value is created. valueFn is
// an optional alternative to supplying value directly. It is passed
// the existing value (or nil if none exists) and returns the value
// to write or an error. If valueFn is supplied, value should be nil
// and vice versa. valueFn can delete by returning nil. Returning
// []byte{} will write an empty value, not delete.
//
// The returned boolean indicates whether the put replaced an existing live key
// (including one written previously by the same transaction). This is evaluated
// at the transaction's read timestamp.
// TODO(erikgrinaker): This return value exists solely because valueFn incurs an
// additional seek via maybeGetValue(). In most cases we have already read the
// value via other means, e.g. mvccGetMetadata(). We should restructure the code
// such that valueFn omits unnecessary reads in the common case and then use it
// rather than the returned boolean where needed. See also:
// https://github.com/cockroachdb/cockroach/issues/90609
//
// A lock acquisition is returned in cases where a new lock is acquired (by
// virtue of writing an intent). Puts that don't write intents (such as inline
// writes) or puts that are replays (so no new acquisition to speak of) do not
// return lock acquisitions. The caller may return the lock acquisition struct
// further up the stack so that lock tracking in the lock table is correctly
// updated.
//
// The given iter must surface range keys to correctly account for
// MVCC range tombstones in MVCC stats.
//
// Note that, when writing transactionally, the txn's timestamps
// dictate the timestamp of the operation, and the timestamp parameter
// is redundant. Specifically, the intent is written at the txn's
// provisional commit timestamp, txn.WriteTimestamp, unless it is
// forwarded by an existing committed value above that timestamp.
// However, reads (e.g., for a ConditionalPut) are performed at the
// txn's read timestamp (txn.ReadTimestamp) to ensure that the
// client sees a consistent snapshot of the database. Any existing
// committed writes that are newer than the read timestamp will thus
// generate a WriteTooOld error.
//
// In an attempt to reduce confusion about which timestamp applies, when writing
// transactionally, the timestamp parameter must be equal to the transaction's
// read timestamp. (One could imagine instead requiring that the timestamp
// parameter be set to hlc.Timestamp{} when writing transactionally, but
// hlc.Timestamp{} is already used as a sentinel for inline puts.)
//
// The opts.LocalTimestamp parameter dictates the local clock timestamp
// assigned to the key-value. It should be taken from the local HLC
// clock on the leaseholder that is performing the write and must be
// below the leaseholder's lease expiration. If the supplied local
// timestamp is empty (hlc.ClockTimestamp{}), the value will not be
// assigned an explicit local timestamp. The effect of this is that
// readers treat the local timestamp as being equal to the version
// timestamp.
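//
// As a concrete, illustrative example of the transactional timestamp rules
// above: for a transaction with ReadTimestamp=10 and WriteTimestamp=15, the
// caller must pass timestamp=10. Any reads performed on behalf of a valueFn
// are evaluated at 10, a committed value at timestamp 12 produces a
// WriteTooOld error, and the intent itself is written at 15 (or higher, if
// this transaction's existing intent on the key already sits above 15).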
func mvccPutInternal(
ctx context.Context,
writer Writer,
iter MVCCIterator,
ltScanner *lockTableKeyScanner,
key roachpb.Key,
timestamp hlc.Timestamp,
value roachpb.Value,
buf *putBuffer,
valueFn func(optionalValue) (roachpb.Value, error),
opts MVCCWriteOptions,
) (bool, roachpb.LockAcquisition, error) {
if len(key) == 0 {
return false, roachpb.LockAcquisition{}, emptyKeyError()
}
if timestamp.WallTime < 0 {
return false, roachpb.LockAcquisition{}, errors.Errorf("cannot write to %q at timestamp %s", key, timestamp)
}
if !value.Timestamp.IsEmpty() {
return false, roachpb.LockAcquisition{}, errors.Errorf("cannot have timestamp set in value")
}
if err := opts.validate(); err != nil {
return false, roachpb.LockAcquisition{}, err
}
putIsBlind := iter == nil
putIsInline := timestamp.IsEmpty()
if (putIsBlind || putIsInline) != (ltScanner == nil) {
if ltScanner == nil {
return false, roachpb.LockAcquisition{}, errors.Errorf(
"ltScanner must be non-nil for putIsBlind %t, putIsInline %t", putIsBlind, putIsInline)
} else {
return false, roachpb.LockAcquisition{}, errors.Errorf(
"ltScanner must be nil for putIsBlind %t, putIsInline %t", putIsBlind, putIsInline)
}
}
metaKey := MakeMVCCMetadataKey(key)
var ok bool
var meta *enginepb.MVCCMetadata
var origMetaKeySize, origMetaValSize int64
var origRealKeyChanged hlc.Timestamp
var err error
if !putIsBlind {
// If the put is not blind, scan the MVCC keyspace to find the latest
// existing version, if any, and synthesize an MVCCMetadata value if
// necessary.
ok, origMetaKeySize, origMetaValSize, origRealKeyChanged, err =
mvccGetMetadata(iter, metaKey, &buf.meta)
if err != nil {
return false, roachpb.LockAcquisition{}, err
}
meta = &buf.meta
// Verify we're not mixing inline and non-inline values.
if ok && putIsInline != meta.IsInline() {
return false, roachpb.LockAcquisition{}, errors.Errorf("%q: put is inline=%t, but existing value is inline=%t",
metaKey, putIsInline, meta.IsInline())
}
if ok && !meta.IsInline() {
// INVARIANTS:
// !putIsBlind
// !meta.IsInline()
// !meta.IsInline() => !putIsInline (due to previous if-block)
// !putIsInline && !putIsBlind => ltScanner != nil (due to error check earlier in function)
// So we can use ltScanner safely.
//
// If at least one version is found, scan the lock table for conflicting
// locks and/or an intent on the key from different transactions. If any
// such conflicts are found, the lock table scanner will return a
// LockConflictError.
//
// We only need to scan the lock table if we find at least one version.
// This is because locks cannot be acquired on non-existent keys. This
// constraint permits an important performance optimization — writes to
// non-existent keys only perform a single seek (of the MVCC keyspace) and
// no second seek (of the lock table keyspace).
err = ltScanner.scan(key)
if err != nil {
return false, roachpb.LockAcquisition{}, err
}
// If the lock table scan found the writing transaction's own intent,
// use it as the MVCCMetadata value for this key.
if intentMeta := ltScanner.foundOwn(lock.Intent); intentMeta != nil {
meta = intentMeta
origMetaKeySize = int64(metaKey.EncodedSize())
origMetaValSize = int64(intentMeta.Size())
origRealKeyChanged = intentMeta.Timestamp.ToTimestamp()
}
}
}
// Handle inline put. No IntentHistory is required for inline writes as they
// aren't allowed within transactions. MVCC range tombstones cannot exist
// across them either.
if putIsInline {
if opts.Txn != nil {
return false, roachpb.LockAcquisition{}, errors.Errorf("%q: inline writes not allowed within transactions", metaKey)
}
if valueFn != nil {
var inlineVal optionalValue
if ok {
inlineVal = makeOptionalValue(MVCCValue{Value: roachpb.Value{RawBytes: meta.RawBytes}})
}
if value, err = valueFn(inlineVal); err != nil {
return false, roachpb.LockAcquisition{}, err
}
}
var metaKeySize, metaValSize int64
if !value.IsPresent() {
metaKeySize, metaValSize, err = 0, 0, writer.ClearUnversioned(metaKey.Key, ClearOptions{
// NB: origMetaValSize is only populated by mvccGetMetadata if
// iter != nil.
ValueSizeKnown: iter != nil,
ValueSize: uint32(origMetaValSize),
})
} else {
buf.newMeta = enginepb.MVCCMetadata{RawBytes: value.RawBytes}
metaKeySize, metaValSize, err = buf.putInlineMeta(writer, metaKey, &buf.newMeta)
}
if opts.Stats != nil {
updateStatsForInline(opts.Stats, key, origMetaKeySize, origMetaValSize, metaKeySize, metaValSize)
}
// NB: Inline puts are non-transactional, so return an empty lock
// acquisition.
return ok && !meta.Deleted, roachpb.LockAcquisition{}, err
}
// Determine the read and write timestamps for the write. For a
// non-transactional write, these will be identical. For a transactional
// write, we read at the transaction's read timestamp but write intents at its
// provisional commit timestamp. See the comment on the txn.WriteTimestamp field
// definition for rationale.
readTimestamp := timestamp
writeTimestamp := timestamp
if opts.Txn != nil {
readTimestamp = opts.Txn.ReadTimestamp
if readTimestamp != timestamp {
return false, roachpb.LockAcquisition{}, errors.AssertionFailedf(
"mvccPutInternal: txn's read timestamp %s does not match timestamp %s",
readTimestamp, timestamp)
}
writeTimestamp = opts.Txn.WriteTimestamp
}
timestamp = hlc.Timestamp{} // prevent accidental use below
// Determine what the logical operation is. Are we writing an intent
// or a value directly?
logicalOp := MVCCWriteValueOpType
if opts.Txn != nil {
logicalOp = MVCCWriteIntentOpType
}
var prevIsValue bool
var prevValSize int64
var exReplaced bool
var newIntentHistory []enginepb.MVCCMetadata_SequencedIntent
if ok {
// There is existing metadata for this key; ensure our write is permitted.
metaTimestamp := meta.Timestamp.ToTimestamp()
exReplaced = !meta.Deleted
// Handle intents. MVCC range tombstones should not require any special
// handling, since they cannot be transactional.
if meta.Txn != nil {
// There is an uncommitted write intent.
if opts.Txn == nil || meta.Txn.ID != opts.Txn.ID {
// The current Put operation does not come from the same
// transaction. This should have been caught above by the
// lockTableKeyScanner.
return false, roachpb.LockAcquisition{}, errors.AssertionFailedf("conflicting intent %s,"+
" should have been detected by lock table scan", meta)
} else if opts.Txn.Epoch < meta.Txn.Epoch {
return false, roachpb.LockAcquisition{}, errors.Errorf("put with epoch %d came after put with epoch %d in txn %s",
opts.Txn.Epoch, meta.Txn.Epoch, opts.Txn.ID)
} else if opts.Txn.Epoch == meta.Txn.Epoch && opts.Txn.Sequence <= meta.Txn.Sequence {
// The transaction has executed at this sequence before. This is merely a
// replay of the transactional write. Assert that all is in order and return
// early.
//
// NB: No new lock is acquired by replaying this transactional write, so
// we return an empty lock acquisition struct. Doing so allows us to
// side-step questions about the timestamp at which the replayed intent
// is written, as a replayed write is not moved to the supplied
// transaction's write timestamp in cases where it was originally
// written at a lower timestamp.
return false,
roachpb.LockAcquisition{},
replayTransactionalWrite(ctx, iter, meta, key, value,
opts.Txn, valueFn, opts.ReplayWriteTimestampProtection,
)
}
// We're overwriting the intent that was present at this key, before we do
// that though - we must record the older value in the IntentHistory.
oldVersionKey := metaKey
oldVersionKey.Timestamp = metaTimestamp
// But where to find the older value? There are 4 cases:
// - last write inside txn, same epoch, seqnum of last write is not
// ignored: value at key.
// => read the value associated with the intent with consistent
// mvccGetInternal(). (This is the common case.)
// - last write inside txn, same epoch, seqnum of last write is ignored:
// cannot use value at key.
// => try reading from intent history.
// => if all intent history entries are rolled back, fall back to last
// case below.
// - last write outside txn or at different epoch: use inconsistent
// mvccGetInternal, which will find it outside.
//
// (Note that _some_ value is guaranteed to be found, as indicated by
// ok == true above. The one notable exception is when there are no past
// committed values, and all past writes by this transaction have been
// rolled back, either due to transaction retries or transaction savepoint
// rollbacks.)
var exVal optionalValue
// Set when the current provisional value is not ignored due to a txn
// restart or a savepoint rollback. Represents an encoded MVCCValue.
var curProvValRaw []byte
if opts.Txn.Epoch == meta.Txn.Epoch /* last write inside txn */ {
if !enginepb.TxnSeqIsIgnored(meta.Txn.Sequence, opts.Txn.IgnoredSeqNums) {
// Seqnum of last write is not ignored. Retrieve the value.
iter.SeekGE(oldVersionKey)
var hasPoint bool
if valid, err := iter.Valid(); err != nil {
return false, roachpb.LockAcquisition{}, err
} else if valid {
hasPoint, _ = iter.HasPointAndRange()
}
if !hasPoint || !iter.UnsafeKey().Equal(oldVersionKey) {
return false, roachpb.LockAcquisition{}, errors.Errorf("existing intent value missing: %s", oldVersionKey)
}
// NOTE: we use Value instead of UnsafeValue so that we can move the
// iterator below without invalidating this byte slice.
curProvValRaw, err = iter.Value()
if err != nil {
return false, roachpb.LockAcquisition{}, err
}
curIntentVal, err := DecodeMVCCValue(curProvValRaw)
if err != nil {
return false, roachpb.LockAcquisition{}, err
}
exVal = makeOptionalValue(curIntentVal)
} else {
// Seqnum of last write was ignored. Try retrieving the value from the history.
prevIntent, prevIntentOk := meta.GetPrevIntentSeq(opts.Txn.Sequence, opts.Txn.IgnoredSeqNums)
if prevIntentOk {
prevIntentVal, err := DecodeMVCCValue(prevIntent.Value)
if err != nil {
return false, roachpb.LockAcquisition{}, err
}
exVal = makeOptionalValue(prevIntentVal)
}
}
}
if !exVal.exists {
// "last write inside txn && seqnum of all writes are ignored"
// OR
// "last write outside txn"
// => use inconsistent mvccGetInternal to retrieve the last committed value at key.
//
// Since we want the last committed value on the key, we must
// read below our previous intents here.
optVal, _, err := mvccGet(ctx, iter, key, metaTimestamp.Prev(), MVCCGetOptions{
Tombstones: true,
ReadCategory: opts.Category,
})
if err != nil {
return false, roachpb.LockAcquisition{}, err
}
exVal = optVal
}
exReplaced = exVal.IsPresent()
// Make sure we process valueFn before clearing any earlier
// version. For example, a conditional put within same
// transaction should read previous write.
if valueFn != nil {
value, err = valueFn(exVal)
if err != nil {
return false, roachpb.LockAcquisition{}, err
}
}
// We are replacing our own write intent. If we are writing at
// the same timestamp (see comments in else block) we can
// overwrite the existing intent; otherwise we must manually
// delete the old intent, taking care with MVCC stats.
logicalOp = MVCCUpdateIntentOpType
if metaTimestamp.Less(writeTimestamp) {
{
// If the older write intent has a version underneath it, we need to
// read its size because its GCBytesAge contribution may change as we
// move the intent above it. A similar phenomenon occurs in
// MVCCResolveWriteIntent.
//
// TODO(erikgrinaker): Consider using mvccGet() here instead, but
// needs benchmarking.
prevKey := oldVersionKey.Next()
iter.SeekGE(prevKey)
valid, err := iter.Valid()
if err != nil {
return false, roachpb.LockAcquisition{}, err
} else if valid {
// If we land on a bare range key, step onto the next key. This may
// be a point key at the same key position, or a different key.
if hasPoint, hasRange := iter.HasPointAndRange(); hasRange && !hasPoint {
iter.Next()
if valid, err = iter.Valid(); err != nil {
return false, roachpb.LockAcquisition{}, err
}
}
}
if valid && iter.UnsafeKey().Key.Equal(prevKey.Key) {
prevUnsafeKey := iter.UnsafeKey()
if !prevUnsafeKey.IsValue() {
return false, roachpb.LockAcquisition{}, errors.Errorf("expected an MVCC value key: %s", prevUnsafeKey)
}
// We must now be on a point key, but it may be covered by an
// existing MVCC range tombstone. If it isn't, account for it.
_, hasRange := iter.HasPointAndRange()
if !hasRange || iter.RangeKeys().Versions[0].Timestamp.Less(prevUnsafeKey.Timestamp) {
prevValLen, prevValIsTombstone, err := iter.MVCCValueLenAndIsTombstone()
if err != nil {
return false, roachpb.LockAcquisition{}, err
}
if !prevValIsTombstone {
prevIsValue = !prevValIsTombstone
prevValSize = int64(prevValLen)
}
}
}
iter = nil // prevent accidental use below
}
// TODO(jackson): Do we know the encoded value size in the other
// cases?
if err := writer.ClearMVCC(oldVersionKey, ClearOptions{
ValueSizeKnown: curProvValRaw != nil,
ValueSize: uint32(len(curProvValRaw)),
}); err != nil {
return false, roachpb.LockAcquisition{}, err
}
} else if writeTimestamp.Less(metaTimestamp) {
// This case occurs when we're writing a key twice within a
// txn, and our timestamp has been pushed forward because of
// a write-too-old error on this key. For this case, we want
// to continue writing at the higher timestamp or else the
// MVCCMetadata could end up pointing *under* the newer write.
writeTimestamp = metaTimestamp
}
// Since an intent with a smaller sequence number exists for the
// same transaction, we must add the previous value and sequence to
// the intent history, if that previous value does not belong to an
// ignored sequence number.
//
// If the epoch of the transaction doesn't match the epoch of the
// intent, or if the existing value is nil due to all past writes
// being ignored and there are no other committed values, blow away
// the intent history.
//
// Note that the only case where txn.Epoch == meta.Txn.Epoch &&
// exVal == nil will be true is when all past writes by this
// transaction are ignored, and there are no past committed values
// at this key. In that case, we can also blow up the intent
// history.
if opts.Txn.Epoch == meta.Txn.Epoch && exVal.exists {
newIntentHistory = meta.IntentHistory
// Only add the current provisional value to the intent
// history if the current sequence number is not ignored. There's no
// reason to add past committed values or a value already in the intent
// history back into it.
if curProvValRaw != nil {
prevIntent := enginepb.MVCCMetadata_SequencedIntent{
Sequence: meta.Txn.Sequence,
Value: curProvValRaw,
}
newIntentHistory = append(newIntentHistory, prevIntent)
}
} else {
newIntentHistory = nil
}
} else if readTimestamp.LessEq(metaTimestamp) {
// This is the case where we're trying to write under a committed value.
// Obviously we can't do that. We return a write-too-old error indicating
// the earliest valid timestamp that the writer would be able to perform
// such a write. This timestamp can then be used to increment the txn
// timestamp.
//
// NB: even if metaTimestamp is less than writeTimestamp, we can't
// avoid the WriteTooOld error if metaTimestamp is equal to or
// greater than readTimestamp. This is because certain operations
// like ConditionalPuts and InitPuts avoid ever needing refreshes
// by ensuring that they propagate WriteTooOld errors immediately
// instead of allowing their transactions to continue and be retried
// before committing.
writeTimestamp.Forward(metaTimestamp.Next())
writeTooOldErr := kvpb.NewWriteTooOldError(readTimestamp, writeTimestamp, key)
return false, roachpb.LockAcquisition{}, writeTooOldErr
} else /* meta.Txn == nil && metaTimestamp.Less(readTimestamp) */ {
// If a valueFn is specified, read the existing value using iter.
opts := MVCCGetOptions{
Tombstones: true,
ReadCategory: opts.Category,
}
if valueFn != nil {
exVal, _, err := mvccGet(ctx, iter, key, readTimestamp, opts)
if err != nil {
return false, roachpb.LockAcquisition{}, err
}
value, err = valueFn(exVal)
if err != nil {
return false, roachpb.LockAcquisition{}, err
}
}
}
} else /* !ok */ {
// There is no existing value for this key. Even if the new value is
// nil, write a deletion tombstone for the key.
if valueFn != nil {
value, err = valueFn(optionalValue{exists: false})
if err != nil {
return false, roachpb.LockAcquisition{}, err
}
}
}
versionKey := metaKey
versionKey.Timestamp = writeTimestamp
versionValue := MVCCValue{}
versionValue.Value = value
versionValue.LocalTimestamp = opts.LocalTimestamp
versionValue.OmitInRangefeeds = opts.OmitInRangefeeds
versionValue.ImportEpoch = opts.ImportEpoch
versionValue.OriginID = opts.OriginID
if opts.OriginTimestamp.IsSet() {
versionValue.OriginTimestamp = opts.OriginTimestamp
}
if buildutil.CrdbTestBuild {
if seq, seqOK := kvnemesisutil.FromContext(ctx); seqOK {
versionValue.KVNemesisSeq.Set(seq)
}
}
if !versionValue.LocalTimestampNeeded(versionKey.Timestamp) ||
!writer.ShouldWriteLocalTimestamps(ctx) {
versionValue.LocalTimestamp = hlc.ClockTimestamp{}
}
// Write the mvcc metadata now that we have sizes for the latest
// versioned value. For values, the size of keys is always accounted
// for as MVCCVersionTimestampSize. The size of the metadata key is
// accounted for separately.
newMeta := &buf.newMeta
{
var txnMeta *enginepb.TxnMeta
if opts.Txn != nil {
txnMeta = &opts.Txn.TxnMeta
// If we bumped the WriteTimestamp, we update both the TxnMeta and the
// MVCCMetadata.Timestamp.
if txnMeta.WriteTimestamp != versionKey.Timestamp {
txnMetaCpy := *txnMeta
txnMetaCpy.WriteTimestamp.Forward(versionKey.Timestamp)
txnMeta = &txnMetaCpy
}
}
newMeta.Txn = txnMeta
}
newMeta.Timestamp = versionKey.Timestamp.ToLegacyTimestamp()
newMeta.KeyBytes = MVCCVersionTimestampSize
newMeta.ValBytes = int64(encodedMVCCValueSize(versionValue))
newMeta.Deleted = versionValue.IsTombstone()
newMeta.IntentHistory = newIntentHistory
var metaKeySize, metaValSize int64
var lockAcquisition roachpb.LockAcquisition
if newMeta.Txn != nil {
// Determine whether the transaction had previously written an intent on
// this key and we intend to update that intent, or whether this is the
// first time an intent is being written. ok represents the presence of a
// meta (an actual intent or a manufactured meta). buf.meta.Txn!=nil
// represents a non-manufactured meta, i.e., there is an intent.
alreadyExists := ok && meta.Txn != nil
// Write the intent metadata key.
metaKeySize, metaValSize, err = buf.putLockMeta(
writer, metaKey.Key, lock.Intent, newMeta, alreadyExists)
if err != nil {
return false, roachpb.LockAcquisition{}, err
}
lockAcquisition = roachpb.MakeLockAcquisition(
*newMeta.Txn, metaKey.Key, lock.Replicated, lock.Intent, opts.Txn.IgnoredSeqNums,
)
} else {
// Per-key stats count the full-key once and MVCCVersionTimestampSize for
// each versioned value. We maintain that accounting even when the MVCC
// metadata is implicit.
metaKeySize = int64(metaKey.EncodedSize())
}
// Write the mvcc value.
//
// NB: this was previously performed before the mvcc metadata write, but
// benchmarking has shown that performing this write afterwards results in a 6%
// throughput improvement on write-heavy workloads. The reason for this is
// that the meta key is always ordered before the value key and that
// RocksDB's skiplist memtable implementation includes a fast-path for
// sequential insertion patterns.
if err := writer.PutMVCC(versionKey, versionValue); err != nil {
return false, roachpb.LockAcquisition{}, err
}
// Update MVCC stats.
if opts.Stats != nil {
// Adjust the stats metadata for MVCC range tombstones. The MVCC stats
// update only cares about changes to real point keys, but the above logic
// needs to care about MVCC range tombstones for conflict purposes.
//
// Specifically, if a real point key was covered by a range tombstone, we
// must set meta.Timestamp to the timestamp where the real point key was
// deleted (either by a point tombstone or the lowest range tombstone). If
// there was no real point key, meta must be nil. In all other cases,
// meta.Timestamp will already equal origRealKeyChanged.
if origRealKeyChanged.IsEmpty() {
meta = nil // no real point key was found
}
if meta != nil {
meta.Timestamp = origRealKeyChanged.ToLegacyTimestamp()
}
opts.Stats.Add(updateStatsOnPut(key, prevIsValue, prevValSize, origMetaKeySize, origMetaValSize,
metaKeySize, metaValSize, meta, newMeta))
}
// Log the logical MVCC operation.
logicalOpDetails := MVCCLogicalOpDetails{
Key: key,
Timestamp: writeTimestamp,
Safe: true,
}
if txn := newMeta.Txn; txn != nil {
logicalOpDetails.Txn = *txn
}
writer.LogLogicalOp(logicalOp, logicalOpDetails)
return exReplaced, lockAcquisition, nil
}
// MVCCIncrement fetches the value for key, and assuming the value is
// an "integer" type, increments it by inc and stores the new
// value. The newly incremented value is returned.
//
// An initial value is read from the key using the same operational
// timestamp as we use to write a value.
//
// Note that, when writing transactionally, the txn's timestamps
// dictate the timestamp of the operation, and the timestamp parameter is
// confusing and redundant. See the comment on mvccPutInternal for details.
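//
// A minimal usage sketch (illustrative only; ctx, rw, and ms are assumed to be
// provided by the caller):
//
//	newVal, acq, err := MVCCIncrement(ctx, rw, roachpb.Key("counter"),
//		hlc.Timestamp{WallTime: 1}, MVCCWriteOptions{Stats: ms}, 5 /* inc */)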
func MVCCIncrement(
ctx context.Context,
rw ReadWriter,
key roachpb.Key,
timestamp hlc.Timestamp,
opts MVCCWriteOptions,
inc int64,
) (int64, roachpb.LockAcquisition, error) {
iter, err := newMVCCIterator(
ctx, rw, timestamp, false /* rangeKeyMasking */, true, /* noInterleavedIntents */
IterOptions{
KeyTypes: IterKeyTypePointsAndRanges,
Prefix: true,
ReadCategory: opts.Category,
},
)
if err != nil {
return 0, roachpb.LockAcquisition{}, err
}
defer iter.Close()
inlineIncrement := timestamp.IsEmpty()
var ltScanner *lockTableKeyScanner
if !inlineIncrement {
ltScanner, err = newLockTableKeyScanner(
ctx, rw, opts.TxnID(), lock.Intent, opts.MaxLockConflicts, opts.TargetLockConflictBytes, opts.Category)
if err != nil {
return 0, roachpb.LockAcquisition{}, err
}
defer ltScanner.close()
}
var int64Val int64
var newInt64Val int64
valueFn := func(value optionalValue) (roachpb.Value, error) {
if value.IsPresent() {
var err error
if int64Val, err = value.Value.GetInt(); err != nil {
return roachpb.Value{}, errors.Errorf("key %q does not contain an integer value", key)
}
}
// Check for overflow and underflow.
if willOverflow(int64Val, inc) {
// Return the old value, since we've failed to modify it.
newInt64Val = int64Val
return roachpb.Value{}, &kvpb.IntegerOverflowError{
Key: key,
CurrentValue: int64Val,
IncrementValue: inc,
}
}
newInt64Val = int64Val + inc
newValue := roachpb.Value{}
newValue.SetInt(newInt64Val)
newValue.InitChecksum(key)
return newValue, nil
}
acq, err := mvccPutUsingIter(ctx, rw, iter, ltScanner, key, timestamp, noValue, valueFn, opts)
return newInt64Val, acq, err
}
// CPutMissingBehavior describes the handling of a non-existent expected value.
type CPutMissingBehavior bool
const (
// CPutAllowIfMissing is used to indicate a CPut can also succeed when the
// expected entry does not exist.
CPutAllowIfMissing CPutMissingBehavior = true
// CPutFailIfMissing is used to indicate the existing value must match the
// expected value exactly, i.e., if a value is expected, it must exist.
CPutFailIfMissing CPutMissingBehavior = false
)
// ConditionalPutWriteOptions bundles options for the
// MVCCConditionalPut and MVCCBlindConditionalPut functions.
type ConditionalPutWriteOptions struct {
MVCCWriteOptions
AllowIfDoesNotExist CPutMissingBehavior
// OriginTimestamp, if set, indicates that the caller wants to put the
// value only if any existing key is older than this timestamp.
//
// See the comment on the OriginTimestamp field of
// kvpb.ConditionalPutRequest for more details.
OriginTimestamp hlc.Timestamp
// ShouldWinOriginTimestampTie indicates whether the value should be
// accepted if the origin timestamp is the same as the
// origin_timestamp/mvcc_timestamp of the existing value.
//
// See the comment on the ShouldWinOriginTimestampTie field of
// kvpb.ConditionalPutRequest for more details.
ShouldWinOriginTimestampTie bool
}
// MVCCConditionalPut sets the value for a specified key only if the expected
// value matches. If not, it returns a ConditionFailedError containing the
// actual value.
//
// The condition check reads a value from the key using the same operational
// timestamp as we use to write a value.
//
// Note that, when writing transactionally, the txn's timestamps
// dictate the timestamp of the operation, and the timestamp parameter is
// confusing and redundant. See the comment on mvccPutInternal for details.
//
// An empty expVal means that the key is expected to not exist. If not empty,
// expValue needs to correspond to a Value.TagAndDataBytes() - i.e. a key's
// value without the checksum (as the checksum includes the key too).
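//
// A sketch of constructing expVal from a previously read value (illustrative
// only; prevValue, key, ts, newValue, and writeOpts are assumed to be
// caller-provided):
//
//	expVal := prevValue.TagAndDataBytes()
//	acq, err := MVCCConditionalPut(ctx, rw, key, ts, newValue, expVal,
//		ConditionalPutWriteOptions{MVCCWriteOptions: writeOpts})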
func MVCCConditionalPut(
ctx context.Context,
rw ReadWriter,
key roachpb.Key,
timestamp hlc.Timestamp,
value roachpb.Value,
expVal []byte,
opts ConditionalPutWriteOptions,
) (roachpb.LockAcquisition, error) {
iter, err := newMVCCIterator(
ctx, rw, timestamp, false /* rangeKeyMasking */, true, /* noInterleavedIntents */
IterOptions{
KeyTypes: IterKeyTypePointsAndRanges,
Prefix: true,
ReadCategory: opts.Category,
},
)
if err != nil {
return roachpb.LockAcquisition{}, err
}
defer iter.Close()
inlinePut := timestamp.IsEmpty()
var ltScanner *lockTableKeyScanner
if !inlinePut {
ltScanner, err = newLockTableKeyScanner(
ctx, rw, opts.TxnID(), lock.Intent, opts.MaxLockConflicts, opts.TargetLockConflictBytes, opts.Category)
if err != nil {
return roachpb.LockAcquisition{}, err
}
defer ltScanner.close()
}
return mvccConditionalPutUsingIter(
ctx, rw, iter, ltScanner, key, timestamp, value, expVal, opts)
}
// MVCCBlindConditionalPut is a fast-path of MVCCConditionalPut. See the
// MVCCConditionalPut comments for details of the
// semantics. MVCCBlindConditionalPut skips retrieving the existing metadata
// for the key, requiring the caller to guarantee that no versions of the key
// currently exist.
//
// Note that, when writing transactionally, the txn's timestamps
// dictate the timestamp of the operation, and the timestamp parameter is
// confusing and redundant. See the comment on mvccPutInternal for details.
func MVCCBlindConditionalPut(
ctx context.Context,
writer Writer,
key roachpb.Key,
timestamp hlc.Timestamp,
value roachpb.Value,
expVal []byte,
opts ConditionalPutWriteOptions,
) (roachpb.LockAcquisition, error) {
return mvccConditionalPutUsingIter(
ctx, writer, nil, nil, key, timestamp, value, expVal, opts)
}
// maybeConditionFailedError returns a non-nil ConditionFailedError if
// the expBytes and actVal don't match. If allowNoExisting is true,
// then a non-existent actual value is allowed even when
// expected-value is non-empty.
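//
// Spelled out from the logic below (exp = expBytes non-empty, act = actual
// value present):
//
//	exp  act  allowNoExisting  result
//	yes  yes  any              error iff the bytes differ
//	yes  no   false            error
//	yes  no   true             nil
//	no   yes  any              error
//	no   no   any              nil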
func maybeConditionFailedError(
expBytes []byte, actVal optionalValue, allowNoExisting bool,
) *kvpb.ConditionFailedError {
expValPresent := len(expBytes) != 0
actValPresent := actVal.IsPresent()
if expValPresent && actValPresent {
if !bytes.Equal(expBytes, actVal.Value.TagAndDataBytes()) {
return &kvpb.ConditionFailedError{
ActualValue: actVal.ToPointer(),
}
}
} else if expValPresent != actValPresent && (actValPresent || !allowNoExisting) {
return &kvpb.ConditionFailedError{
ActualValue: actVal.ToPointer(),
}
}
return nil
}
func mvccConditionalPutUsingIter(
ctx context.Context,
writer Writer,
iter MVCCIterator,
ltScanner *lockTableKeyScanner,
key roachpb.Key,
timestamp hlc.Timestamp,
value roachpb.Value,
expBytes []byte,
opts ConditionalPutWriteOptions,
) (roachpb.LockAcquisition, error) {
if !opts.OriginTimestamp.IsEmpty() {
if bool(opts.AllowIfDoesNotExist) {
return roachpb.LockAcquisition{}, errors.AssertionFailedf("AllowIfDoesNotExist and non-zero OriginTimestamp are incompatible")
}
putIsInline := timestamp.IsEmpty()
if putIsInline {
return roachpb.LockAcquisition{}, errors.AssertionFailedf("inline put and non-zero OriginTimestamp are incompatible")
}
}
var valueFn func(existVal optionalValue) (roachpb.Value, error)
if opts.OriginTimestamp.IsEmpty() {
valueFn = func(actualValue optionalValue) (roachpb.Value, error) {
if err := maybeConditionFailedError(expBytes, actualValue, bool(opts.AllowIfDoesNotExist)); err != nil {
return roachpb.Value{}, err
}
return value, nil
}
} else {
valueFn = func(existVal optionalValue) (roachpb.Value, error) {
originTSWinner, existTS := existVal.isOriginTimestampWinner(opts.OriginTimestamp,
opts.ShouldWinOriginTimestampTie)
if !originTSWinner {
return roachpb.Value{}, &kvpb.ConditionFailedError{
OriginTimestampOlderThan: existTS,
}
}
// We are the OriginTimestamp comparison winner. We
// check the expected bytes because a mismatch implies
// that the caller may have produced other commands with
// outdated data.
if err := maybeConditionFailedError(expBytes, existVal, false); err != nil {
err.HadNewerOriginTimestamp = true
return roachpb.Value{}, err
}
return value, nil
}
// TODO(ssd): We set the OriginTimestamp on our write
// options to the originTimestamp passed to us. We
// don't assert they are the same yet because it is
// still unclear how exactly we want to manage this in
// the long run.
opts.MVCCWriteOptions.OriginTimestamp = opts.OriginTimestamp
}
return mvccPutUsingIter(ctx, writer, iter, ltScanner, key, timestamp, noValue, valueFn, opts.MVCCWriteOptions)
}
// MVCCInitPut sets the value for a specified key if the key doesn't exist. It
// returns a ConditionFailedError when the write fails or if the key exists with
// an existing value that is different from the supplied value. If
// failOnTombstones is set to true, tombstones count as mismatched values and
// will cause a ConditionFailedError.
//
// Note that, when writing transactionally, the txn's timestamps
// dictate the timestamp of the operation, and the timestamp parameter is
// confusing and redundant. See the comment on mvccPutInternal for details.
func MVCCInitPut(
ctx context.Context,
rw ReadWriter,
key roachpb.Key,
timestamp hlc.Timestamp,
value roachpb.Value,
failOnTombstones bool,
opts MVCCWriteOptions,
) (roachpb.LockAcquisition, error) {
iter, err := newMVCCIterator(
ctx, rw, timestamp, false /* rangeKeyMasking */, true, /* noInterleavedIntents */
IterOptions{
KeyTypes: IterKeyTypePointsAndRanges,
Prefix: true,
ReadCategory: opts.Category,
},
)
if err != nil {
return roachpb.LockAcquisition{}, err
}
defer iter.Close()
inlinePut := timestamp.IsEmpty()
var ltScanner *lockTableKeyScanner
if !inlinePut {
ltScanner, err = newLockTableKeyScanner(
ctx, rw, opts.TxnID(), lock.Intent, opts.MaxLockConflicts, opts.TargetLockConflictBytes, opts.Category)
if err != nil {
return roachpb.LockAcquisition{}, err
}
defer ltScanner.close()
}
return mvccInitPutUsingIter(ctx, rw, iter, ltScanner, key, timestamp, value, failOnTombstones, opts)
}
// MVCCBlindInitPut is a fast-path of MVCCInitPut. See the MVCCInitPut
// comments for details of the semantics. MVCCBlindInitPut skips
// retrieving the existing metadata for the key, requiring the caller
// to guarantee that no versions of the key currently exist.
//
// Note that, when writing transactionally, the txn's timestamps
// dictate the timestamp of the operation, and the timestamp parameter is
// confusing and redundant. See the comment on mvccPutInternal for details.
func MVCCBlindInitPut(
ctx context.Context,
w Writer,
key roachpb.Key,
timestamp hlc.Timestamp,
value roachpb.Value,
failOnTombstones bool,
opts MVCCWriteOptions,
) (roachpb.LockAcquisition, error) {
return mvccInitPutUsingIter(
ctx, w, nil, nil, key, timestamp, value, failOnTombstones, opts)
}
func mvccInitPutUsingIter(
ctx context.Context,
w Writer,
iter MVCCIterator,
ltScanner *lockTableKeyScanner,
key roachpb.Key,
timestamp hlc.Timestamp,
value roachpb.Value,
failOnTombstones bool,
opts MVCCWriteOptions,
) (roachpb.LockAcquisition, error) {
valueFn := func(existVal optionalValue) (roachpb.Value, error) {
if failOnTombstones && existVal.IsTombstone() {
// We found a tombstone and failOnTombstones is true: fail.
return roachpb.Value{}, &kvpb.ConditionFailedError{
ActualValue: existVal.ToPointer(),
}
}
if existVal.IsPresent() && !existVal.Value.EqualTagAndData(value) {
// The existing value does not match the supplied value.
return roachpb.Value{}, &kvpb.ConditionFailedError{
ActualValue: existVal.ToPointer(),
}
}
return value, nil
}
return mvccPutUsingIter(ctx, w, iter, ltScanner, key, timestamp, noValue, valueFn, opts)
}
// mvccKeyFormatter is an fmt.Formatter for MVCC Keys.
type mvccKeyFormatter struct {
key MVCCKey
err error
}
var _ fmt.Formatter = mvccKeyFormatter{}
// Format implements the fmt.Formatter interface.
func (m mvccKeyFormatter) Format(f fmt.State, c rune) {
if m.err != nil {
errors.FormatError(m.err, f, c)
return
}
m.key.Format(f, c)
}
// MVCCMerge implements a merge operation. Merge adds integer values,
// concatenates undifferentiated byte slice values, and efficiently
// combines time series observations if the roachpb.Value tag value
// indicates the value byte slice is of type TIMESERIES.
//
// Merges are not really MVCC operations: they operate on inline values with no
// version, and do not check for conflicts with other MVCC versions.
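//
// A minimal usage sketch (illustrative only; ctx, rw, ms, and tsValue are
// assumed to be provided by the caller, with tsValue carrying the appropriate
// value tag):
//
//	if err := MVCCMerge(ctx, rw, ms, roachpb.Key("tsd"),
//		hlc.Timestamp{WallTime: 1}, tsValue); err != nil {
//		return err
//	}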
func MVCCMerge(
_ context.Context,
rw ReadWriter,
ms *enginepb.MVCCStats,
key roachpb.Key,
timestamp hlc.Timestamp,
value roachpb.Value,
) error {
if len(key) == 0 {
return emptyKeyError()
}
metaKey := MakeMVCCMetadataKey(key)
buf := newPutBuffer()
defer buf.release()
// Every type flows through here, so we can't use the typed getters.
rawBytes := value.RawBytes
// Encode and merge the MVCC metadata with inlined value.
meta := &buf.meta
*meta = enginepb.MVCCMetadata{RawBytes: rawBytes}
// If non-zero, set the merge timestamp to provide some replay protection.
if !timestamp.IsEmpty() {
buf.ts = timestamp.ToLegacyTimestamp()
meta.MergeTimestamp = &buf.ts
}
data, err := buf.marshalMeta(meta)
if err == nil {
if err = rw.Merge(metaKey, data); err == nil && ms != nil {
ms.Add(updateStatsOnMerge(
key, int64(len(rawBytes)), timestamp.WallTime))
}
}
return err
}
// MVCCClearTimeRange clears all MVCC versions (point keys and range keys)
// within the span [key, endKey) which have timestamps in the span
// (startTime, endTime]. This can have the apparent effect of "reverting" the
// range to startTime if all of the older revisions of cleared keys are still
// available (i.e. have not been GC'ed).
//
// Long runs of point keys that all qualify for clearing will be cleared via a
// single clear-range operation, as specified by clearRangeThreshold. Once
// maxBatchSize Clear and ClearRange operations are hit during iteration, the
// next matching key is instead returned in the resumeSpan. It is possible to
// exceed maxBatchSize by up to the size of the buffer of keys selected for
// deletion but not yet flushed (as done to detect long runs for clearing in a
// single ClearRange).
//
// Limiting the number of keys or ranges of keys processed can still cause a
// batch that is too large -- in number of bytes -- for raft to replicate if the
// keys are very large. So if the total length of the keys or key spans cleared
// exceeds maxBatchByteSize it will also stop and return a resume span.
//
// leftPeekBound and rightPeekBound are bounds that will be used to peek for
// surrounding MVCC range keys that may be merged or fragmented by our range key
// clears. They should correspond to the latches held by the command, and not
// exceed the Raft range boundaries.
//
// This function handles the stats computations to determine the correct
// incremental deltas of clearing these keys (and correctly determining if it
// does or does not change the live and gc keys).
//
// If the function encounters an intent (with any timestamp), a replicated lock,
// or any inline meta in the key span, it will return an error.
//
// TODO(erikgrinaker): endTime does not actually work here -- if there are keys
// above endTime then the stats do not properly account for them. We probably
// don't need those semantics, so consider renaming this to MVCCRevertRange and
// removing the endTime parameter.
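//
// A hedged usage sketch (illustrative only; the bounds and limits shown are
// arbitrary, and in practice the peek bounds come from the command's latches):
//
//	resume, err := MVCCClearTimeRange(ctx, rw, ms,
//		roachpb.Key("a"), roachpb.Key("z"), startTime, endTime,
//		roachpb.Key("a"), roachpb.Key("z"), /* leftPeekBound, rightPeekBound */
//		64 /* clearRangeThreshold */, 1000 /* maxBatchSize */,
//		1<<20 /* maxBatchByteSize */, 64 /* maxLockConflicts */)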
func MVCCClearTimeRange(
ctx context.Context,
rw ReadWriter,
ms *enginepb.MVCCStats,
key, endKey roachpb.Key,
startTime, endTime hlc.Timestamp,
leftPeekBound, rightPeekBound roachpb.Key,
clearRangeThreshold int,
maxBatchSize, maxBatchByteSize int64,
maxLockConflicts int64,
) (roachpb.Key, error) {
var batchSize, batchByteSize int64
var resumeKey roachpb.Key
if clearRangeThreshold == 0 {
clearRangeThreshold = 2
}
if maxBatchSize == 0 {
maxBatchSize = math.MaxInt64
}
if maxBatchByteSize == 0 {
maxBatchByteSize = math.MaxInt64
}
if rightPeekBound == nil {
rightPeekBound = keys.MaxKey
}
// endTime must be set. Otherwise, MVCCIncrementalIterator defaults to
// MaxTimestamp, unlike the code below which uses it literally.
if endTime.IsEmpty() {
return nil, errors.New("end time is required")
}
// endTime must also be above startTime.
if endTime.LessEq(startTime) {
return nil, errors.Errorf("end time %s must be above start time %s", endTime, startTime)
}
// Since we're setting up multiple iterators, we require consistent iterators.
if !rw.ConsistentIterators() {
return nil, errors.AssertionFailedf("requires consistent iterators")
}
// Check for any overlapping locks (at any timestamp), and return them to
// be resolved. We don't _expect_ to hit any since the RevertRange is only
// intended for non-live key spans, but there could be an intent or lock
// left over from before the keyspace became non-live.
if locks, err := ScanLocks(
ctx, rw, key, endKey, maxLockConflicts, 0); err != nil {
return nil, err
} else if len(locks) > 0 {
return nil, &kvpb.LockConflictError{Locks: locks}
}
// When iterating, instead of immediately clearing a matching key we can
// accumulate it in buf until either a) clearRangeThreshold is reached and
// we discard the buffer, instead just keeping track of where the span of
// matching keys started, or b) a non-matching key is seen and we flush the buffer
// keys one by one as Clears. Once we switch to just tracking where the run
// started, on seeing a non-matching key we flush the run via one ClearRange.
// This can be a big win for reverting bulk-ingestion of clustered data as the
// entire span may likely match and thus could be cleared in one ClearRange
// instead of hundreds of thousands of individual Clears.
type bufferedKey struct {
MVCCKey
valLen uint32
}
buf := make([]bufferedKey, clearRangeThreshold)
var bufSize int
var clearRangeStart MVCCKey
clearMatchingKey := func(k MVCCKey, valLen uint32) {
if len(clearRangeStart.Key) == 0 {
// Currently buffering keys to clear one-by-one.
if bufSize < clearRangeThreshold {
buf[bufSize].Key = append(buf[bufSize].Key[:0], k.Key...)
buf[bufSize].Timestamp = k.Timestamp
buf[bufSize].valLen = valLen
bufSize++
} else {
// Buffer is now full -- switch to just tracking the start of the range
// from which we will clear when we either see a non-matching key or if
// we finish iterating.
clearRangeStart = buf[0].MVCCKey
bufSize = 0
}
}
}
flushClearedKeys := func(nonMatch MVCCKey) error {
if len(clearRangeStart.Key) != 0 {
if err := rw.ClearMVCCVersions(clearRangeStart, nonMatch); err != nil {
return err
}
batchByteSize += int64(clearRangeStart.EncodedSize() + nonMatch.EncodedSize())
batchSize++
clearRangeStart = MVCCKey{}
} else if bufSize > 0 {
var encodedBufSize int64
for i := 0; i < bufSize; i++ {
encodedBufSize += int64(buf[i].MVCCKey.EncodedSize())
}
// Even though we didn't get a large enough number of keys to switch to
// clearrange, the byte size of the keys we did get is now too large to
// encode them all within the byte size limit, so use clearrange anyway.
if batchByteSize+encodedBufSize >= maxBatchByteSize {
if err := rw.ClearMVCCVersions(buf[0].MVCCKey, nonMatch); err != nil {
return err
}
batchByteSize += int64(buf[0].EncodedSize() + nonMatch.EncodedSize())
batchSize++
} else {
for i := 0; i < bufSize; i++ {
if buf[i].Timestamp.IsEmpty() {
// Inline metadata. Not an intent because iteration below fails
// if it sees an intent.
if err := rw.ClearUnversioned(buf[i].Key, ClearOptions{
ValueSizeKnown: true,
ValueSize: buf[i].valLen,
}); err != nil {
return err
}
} else {
if err := rw.ClearMVCC(buf[i].MVCCKey, ClearOptions{
ValueSizeKnown: true,
ValueSize: buf[i].valLen,
}); err != nil {
return err
}
}
}
batchByteSize += encodedBufSize
batchSize += int64(bufSize)
}
bufSize = 0
}
return nil
}
// We also buffer the range key stack to clear, and flush it when we hit a new
// stack.
//
// TODO(erikgrinaker): For now, we remove individual range keys. We could do
// something similar to point keys and keep track of long runs to remove, but
// we expect them to be rare so this should be fine for now.
var clearRangeKeys MVCCRangeKeyStack
flushRangeKeys := func(resumeKey roachpb.Key) error {
if clearRangeKeys.IsEmpty() {
return nil
}
if len(resumeKey) > 0 {
if resumeKey.Compare(clearRangeKeys.Bounds.Key) <= 0 {
return nil
} else if resumeKey.Compare(clearRangeKeys.Bounds.EndKey) <= 0 {
clearRangeKeys.Bounds.EndKey = resumeKey
}
}
// Fetch the existing range keys (if any), to adjust MVCC stats. We set up
// a new iterator for every batch, which both sees our own writes as well as
// any range keys outside of the time bounds.
rkIter, err := rw.NewMVCCIterator(ctx, MVCCKeyIterKind, IterOptions{
KeyTypes: IterKeyTypeRangesOnly,
LowerBound: leftPeekBound,
UpperBound: rightPeekBound,
ReadCategory: fs.BatchEvalReadCategory,
})
if err != nil {
return err
}
defer rkIter.Close()
cmp, remaining, err := PeekRangeKeysRight(rkIter, clearRangeKeys.Bounds.Key)
if err != nil {
return err
} else if cmp > 0 || !remaining.Bounds.Contains(clearRangeKeys.Bounds) {
return errors.AssertionFailedf("did not find expected range key at %s", clearRangeKeys.Bounds)
} else if !remaining.IsEmpty() {
// Truncate the bounds to the cleared span, so that stats operate on the
// post-fragmented state (if relevant).
remaining.Bounds = clearRangeKeys.Bounds
}
// Clear the range keys.
for _, v := range clearRangeKeys.Versions {
rangeKey := clearRangeKeys.AsRangeKey(v)
if err := rw.ClearMVCCRangeKey(rangeKey); err != nil {
return err
}
batchSize++
batchByteSize += int64(rangeKey.EncodedSize())
if ms != nil {
ms.Add(updateStatsOnRangeKeyClearVersion(remaining, v))
}
if _, ok := remaining.Remove(v.Timestamp); !ok {
return errors.AssertionFailedf("did not find expected range key %s", rangeKey)
}
}
remaining = remaining.Clone()
// Update stats for any fragmentation or merging caused by the clears around
// the bounds.
if ms != nil {
if cmp, lhs, err := PeekRangeKeysLeft(rkIter, clearRangeKeys.Bounds.Key); err != nil {
return err
} else if cmp > 0 {
ms.Add(UpdateStatsOnRangeKeySplit(clearRangeKeys.Bounds.Key, lhs.Versions))
} else if cmp == 0 && lhs.CanMergeRight(remaining) {
ms.Add(updateStatsOnRangeKeyMerge(clearRangeKeys.Bounds.Key, lhs.Versions))
}
if cmp, rhs, err := PeekRangeKeysRight(rkIter, clearRangeKeys.Bounds.EndKey); err != nil {
return err
} else if cmp < 0 {
ms.Add(UpdateStatsOnRangeKeySplit(clearRangeKeys.Bounds.EndKey, rhs.Versions))
} else if cmp == 0 && remaining.CanMergeRight(rhs) {
ms.Add(updateStatsOnRangeKeyMerge(clearRangeKeys.Bounds.EndKey, rhs.Versions))
}
}
clearRangeKeys.Clear()
return nil
}
// Using the IncrementalIterator with the time-bound iter optimization could
// potentially be a big win here -- the expected use-case for this is to run
// over an entire table's span with a very recent timestamp, rolling back just
// the writes of some failed IMPORT and that could very likely only have hit
// some small subset of the table's keyspace. However to get the stats right
// we need a non-time-bound iter e.g. we need to know if there is an older key
// under the one we are clearing to know if we're changing the number of live
// keys. The MVCCIncrementalIterator uses a non-time-bound iter as its source
// of truth, and only uses the TBI iterator as an optimization when finding
// the next KV to iterate over. This pattern allows us to quickly skip over
// swaths of uninteresting keys, but then use a normal iteration to actually
// do the delete, including updating the live key stats correctly.
//
// We've already scanned over the lock table with the call to ScanLocks above,
// so we disable intent interleaving.
iter, err := NewMVCCIncrementalIterator(ctx, rw, MVCCIncrementalIterOptions{
KeyTypes: IterKeyTypePointsAndRanges,
IntentPolicy: MVCCIncrementalIterIntentPolicyIgnore,
StartKey: key,
EndKey: endKey,
StartTime: startTime,
EndTime: endTime,
ReadCategory: fs.BatchEvalReadCategory,
})
if err != nil {
return nil, err
}
defer iter.Close()
// clearedMetaKey is the latest surfaced key that will get cleared.
var clearedMetaKey MVCCKey
// clearedMeta contains metadata on the clearedMetaKey.
var clearedMeta enginepb.MVCCMetadata
// restoredMeta contains metadata on the previous version of the clearedMetaKey.
// Once the key in clearedMetaKey is cleared, the key represented in
// restoredMeta becomes the latest version of this MVCC key.
var restoredMeta enginepb.MVCCMetadata
iter.SeekGE(MVCCKey{Key: key})
for {
if ok, err := iter.Valid(); err != nil {
return nil, err
} else if !ok {
break
}
k := iter.UnsafeKey()
// If we encounter a new range key stack, flush the previous range keys (if
// any) and buffer these for clearing.
//
// NB: RangeKeyChangedIgnoringTime() may fire on a hidden range key outside
// of the time bounds, because of NextIgnoringTime(), in which case
// HasPointAndRange() will return false,false.
if iter.RangeKeyChangedIgnoringTime() {
if err := flushRangeKeys(nil); err != nil { // empties clearRangeKeys
return nil, err
}
if batchSize >= maxBatchSize || batchByteSize >= maxBatchByteSize {
resumeKey = k.Key.Clone()
break
}
hasPoint, hasRange := iter.HasPointAndRange()
if hasRange {
iter.RangeKeys().CloneInto(&clearRangeKeys)
}
if !hasPoint {
// If we landed on a bare range tombstone, we need to check if it revealed
// anything below the time bounds as well.
iter.NextIgnoringTime()
continue
}
}
// Process point keys.
valueLen, valueIsTombstone, err := iter.MVCCValueLenAndIsTombstone()
if err != nil {
return nil, err
}
// First, account for the point key that we cleared previously.
if len(clearedMetaKey.Key) > 0 {
metaKeySize := int64(clearedMetaKey.EncodedSize())
if clearedMetaKey.Key.Equal(k.Key) {
// Since the key matches, our previous clear "restored" this revision of
// this key, so update the stats with this as the "restored" key.
restoredMeta.KeyBytes = MVCCVersionTimestampSize
restoredMeta.ValBytes = int64(valueLen)
restoredMeta.Deleted = valueIsTombstone
restoredMeta.Timestamp = k.Timestamp.ToLegacyTimestamp()
// If there was an MVCC range tombstone between this version and the
// cleared key, then we didn't restore it after all, but we must still
// adjust the stats for the range tombstone. RangeKeysIgnoringTime()
// is cheap, so we don't need any caching here.
if !restoredMeta.Deleted {
if rangeKeys := iter.RangeKeysIgnoringTime(); !rangeKeys.IsEmpty() {
if v, ok := rangeKeys.FirstAtOrAbove(k.Timestamp); ok {
if v.Timestamp.LessEq(clearedMeta.Timestamp.ToTimestamp()) {
restoredMeta.Deleted = true
restoredMeta.KeyBytes = 0
restoredMeta.ValBytes = 0
restoredMeta.Timestamp = v.Timestamp.ToLegacyTimestamp()
}
}
}
}
if ms != nil {
ms.Add(updateStatsOnClear(clearedMetaKey.Key, metaKeySize, 0, metaKeySize, 0,
&clearedMeta, &restoredMeta, restoredMeta.Timestamp.WallTime))
}
} else {
if ms != nil {
ms.Add(updateStatsOnClear(clearedMetaKey.Key, metaKeySize, 0, 0, 0, &clearedMeta, nil, 0))
}
}
}
// Eagerly check whether we've exceeded the batch size. If we return a
// resumeKey we may truncate the buffered MVCC range tombstone clears
// at the current key, in which case we can't record the current point
// key as restored by the range tombstone clear below.
if batchSize >= maxBatchSize || batchByteSize >= maxBatchByteSize {
resumeKey = k.Key.Clone()
clearedMetaKey.Key = clearedMetaKey.Key[:0]
break
}
// Check if the current key was restored by a range tombstone clear, and
// adjust stats accordingly. We've already accounted for the clear of the
// previous point key above. We must also check that the clear actually
// revealed the key, since it may have been covered by the point key that
// we cleared or a different range tombstone below the one we cleared.
if !valueIsTombstone {
if v, ok := clearRangeKeys.FirstAtOrAbove(k.Timestamp); ok {
if !clearedMetaKey.Key.Equal(k.Key) ||
!clearedMeta.Timestamp.ToTimestamp().LessEq(v.Timestamp) {
rangeKeys := iter.RangeKeysIgnoringTime()
if rangeKeys.IsEmpty() || !rangeKeys.HasBetween(k.Timestamp, v.Timestamp.Prev()) {
ms.Add(enginepb.MVCCStats{
LastUpdateNanos: v.Timestamp.WallTime,
LiveCount: 1,
LiveBytes: int64(k.EncodedSize()) + int64(valueLen),
})
}
}
}
}
clearedMetaKey.Key = clearedMetaKey.Key[:0]
if startTime.Less(k.Timestamp) && k.Timestamp.LessEq(endTime) {
clearMatchingKey(k, uint32(valueLen))
clearedMetaKey.Key = append(clearedMetaKey.Key[:0], k.Key...)
clearedMeta.KeyBytes = MVCCVersionTimestampSize
clearedMeta.ValBytes = int64(valueLen)
clearedMeta.Deleted = valueIsTombstone
clearedMeta.Timestamp = k.Timestamp.ToLegacyTimestamp()
// Move the iterator to the next key/value in linear iteration even if it
// lies outside (startTime, endTime].
//
// If iter lands on an older version of the current key, we will update
// the stats in the next iteration of the loop. This is necessary to
// report accurate stats as we have "uncovered" an older version of the
// key by clearing the current version.
//
// If iter lands on the next key, it will either add to the current run of
// keys to be cleared, or trigger a flush depending on whether or not it
// lies in our time bounds respectively.
iter.NextIgnoringTime()
} else {
// This key does not match, so we need to flush our run of matching keys.
if err := flushClearedKeys(k); err != nil {
return nil, err
}
// Move the incremental iterator to the next valid key that can be rolled
// back. If TBI was enabled when initializing the incremental iterator,
// this step could jump over large swaths of keys that do not qualify for
// clearing. However, if we've cleared any range keys, then we need to
// skip to the next key ignoring time, because it may not have been
// revealed.
if !clearRangeKeys.IsEmpty() {
iter.NextKeyIgnoringTime()
} else {
iter.Next()
}
}
}
if len(clearedMetaKey.Key) > 0 && ms != nil {
// If we cleared on the last iteration, no older revision of that key was
// "restored", since otherwise we would have iterated over it.
origMetaKeySize := int64(clearedMetaKey.EncodedSize())
ms.Add(updateStatsOnClear(clearedMetaKey.Key, origMetaKeySize, 0, 0, 0, &clearedMeta, nil, 0))
}
if err := flushRangeKeys(resumeKey); err != nil {
return nil, err
}
flushKey := endKey
if len(resumeKey) > 0 {
flushKey = resumeKey
}
return resumeKey, flushClearedKeys(MVCCKey{Key: flushKey})
}
// MVCCDeleteRange deletes the range of key/value pairs specified by start and
// end keys. It returns the keys deleted when returnKeys is set, the next span
// to resume from, and the number of keys deleted. The returned resume span is
// nil if the max key limit was not reached. The choice max=0 disables the
// limit.
func MVCCDeleteRange(
ctx context.Context,
rw ReadWriter,
key, endKey roachpb.Key,
max int64,
timestamp hlc.Timestamp,
opts MVCCWriteOptions,
returnKeys bool,
) ([]roachpb.Key, *roachpb.Span, int64, []roachpb.LockAcquisition, error) {
// Scan to find the keys to delete.
//
// For a versioned delete range, scan at the request timestamp and with the
// FailOnMoreRecent option set to true. Doing so returns all non-tombstoned
// keys at or below the request timestamp and throws a WriteTooOld error on
// any key (mvcc tombstone or otherwise) above the request timestamp. This is
// different from scanning at MaxTimestamp and deferring write-write conflict
// checking to mvccPutInternal below. That approach ignores mvcc tombstones
// above the request timestamp, which could lead to serializability anomalies
// (see #56458).
//
// For an inline delete range, scan at MaxTimestamp. Doing so is not needed to
// retrieve inline values, but it ensures that all non-inline values are also
// returned. Mixing an inline delete range with MVCC versions is incompatible,
// so we want to pass these incompatible keys to mvccPutInternal to detect the
// condition and return an error. We also scan with FailOnMoreRecent set to
// false. This is not strictly necessary (nothing is
// more recent than MaxTimestamp), but it provides added protection against
// the scan returning a WriteTooOld error.
scanTs := timestamp
failOnMoreRecent := true
if timestamp.IsEmpty() /* inline */ {
scanTs = hlc.MaxTimestamp
failOnMoreRecent = false
}
// In order for this operation to be idempotent when run transactionally, we
// need to perform the initial scan at the previous sequence number so that
// we don't see the result from equal or later sequences.
var scanTxn *roachpb.Transaction
if opts.Txn != nil {
prevSeqTxn := opts.Txn.Clone()
prevSeqTxn.Sequence--
scanTxn = prevSeqTxn
}
res, err := MVCCScan(ctx, rw, key, endKey, scanTs, MVCCScanOptions{
FailOnMoreRecent: failOnMoreRecent,
Txn: scanTxn,
MaxKeys: max,
MaxLockConflicts: opts.MaxLockConflicts,
TargetLockConflictBytes: opts.TargetLockConflictBytes,
ReadCategory: opts.Category,
})
if err != nil {
return nil, nil, 0, nil, err
}
iter, err := newMVCCIterator(
ctx, rw, timestamp, false /* rangeKeyMasking */, true, /* noInterleavedIntents */
IterOptions{
KeyTypes: IterKeyTypePointsAndRanges,
Prefix: true,
ReadCategory: opts.Category,
},
)
if err != nil {
return nil, nil, 0, nil, err
}
defer iter.Close()
inlineDelete := timestamp.IsEmpty()
var ltScanner *lockTableKeyScanner
if !inlineDelete {
ltScanner, err = newLockTableKeyScanner(
ctx, rw, opts.TxnID(), lock.Intent, opts.MaxLockConflicts, opts.TargetLockConflictBytes, opts.Category)
if err != nil {
return nil, nil, 0, nil, err
}
defer ltScanner.close()
}
buf := newPutBuffer()
defer buf.release()
var keys []roachpb.Key
var acqs []roachpb.LockAcquisition
for i, kv := range res.KVs {
_, acq, err := mvccPutInternal(
ctx, rw, iter, ltScanner, kv.Key, timestamp, noValue, buf, nil, opts,
)
if err != nil {
return nil, nil, 0, nil, err
}
if returnKeys {
if i == 0 {
keys = make([]roachpb.Key, len(res.KVs))
acqs = make([]roachpb.LockAcquisition, 0, len(res.KVs))
}
keys[i] = kv.Key
if !acq.Empty() {
// We only want to return non-empty lock acquisitions up the stack.
acqs = append(acqs, acq)
}
}
}
return keys, res.ResumeSpan, res.NumKeys, acqs, nil
}
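// The following sketch is illustrative only and not part of this package's
// API: it shows how a caller might drive MVCCDeleteRange batch by batch for a
// transactional, versioned delete, resuming from the returned span. The helper
// name and the batch limit are hypothetical.
func exampleVersionedDeleteRangeSketch(
	ctx context.Context,
	rw ReadWriter,
	key, endKey roachpb.Key,
	ts hlc.Timestamp,
	txn *roachpb.Transaction,
) error {
	const maxKeysPerBatch = 1000
	for {
		// Delete up to maxKeysPerBatch keys at the transaction's timestamp and
		// ask for the deleted keys back.
		deleted, resumeSpan, _, _, err := MVCCDeleteRange(
			ctx, rw, key, endKey, maxKeysPerBatch, ts,
			MVCCWriteOptions{Txn: txn}, true /* returnKeys */)
		if err != nil {
			return err
		}
		_ = deleted // e.g. report the deleted keys back to the client
		if resumeSpan == nil {
			// The whole span has been processed.
			return nil
		}
		// Continue from the first key that was not processed.
		key = resumeSpan.Key
	}
}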
// MVCCPredicateDeleteRange issues MVCC tombstones at endTime to live keys
// within the span [startKey, endKey) that also have MVCC versions that match
// the predicate filters. Long runs of matched keys will get deleted with a
// range tombstone, while smaller runs will get deleted with point tombstones.
// The keyspaces of each run do not overlap.
//
// This operation is non-transactional, but will check for existing intents in
// the target key span, regardless of timestamp, and return a LockConflictError
// containing up to maxLockConflicts locks.
//
// MVCCPredicateDeleteRange will return with a resumeSpan if the number of tombstones
// written exceeds maxBatchSize or the size of the written tombstones exceeds
// maxBatchByteSize.
// These constraints prevent overwhelming raft.
//
// If a surfaced MVCC key has a timestamp at or above endTime,
// MVCCPredicateDeleteRange returns a WriteTooOldError without a resumeSpan,
// even if tombstones were already written to disk. To resolve, the caller
// should retry the call at a higher timestamp, assuming they have the
// appropriate level of isolation (e.g. the span covers an offline table, in the
// case of IMPORT rollbacks).
//
// An example of how this works: Issuing DeleteRange[a,e)@3 with
// Predicate{StartTime=1} on the following keys would issue tombstones at a@3,
// b@3, and d@3.
//
// t3
// t2 a2 b2 d2 e2
// t1 b1 c1
//
// a b c d e
func MVCCPredicateDeleteRange(
ctx context.Context,
rw ReadWriter,
ms *enginepb.MVCCStats,
startKey, endKey roachpb.Key,
endTime hlc.Timestamp,
localTimestamp hlc.ClockTimestamp,
leftPeekBound, rightPeekBound roachpb.Key,
predicates kvpb.DeleteRangePredicates,
maxBatchSize, maxBatchByteSize int64,
rangeTombstoneThreshold int64,
maxLockConflicts int64,
targetLockConflictBytes int64,
) (*roachpb.Span, error) {
if endTime.IsEmpty() {
return nil, errors.AssertionFailedf("MVCCPredicateDeleteRange expects non-empty endTime")
}
if ms == nil {
return nil, errors.AssertionFailedf(
"MVCCStats passed in to MVCCPredicateDeleteRange must be non-nil to ensure proper stats" +
" computation during Delete operations")
}
if maxBatchSize == 0 {
// Set maxBatchSize to a large number to ensure MVCCPredicateDeleteRange
// doesn't return early due to batch size. Note that maxBatchSize is only
// set to 0 during testing.
maxBatchSize = math.MaxInt64
}
// batchSize is the number of tombstones (point and range) that have been flushed.
var batchSize int64
var batchByteSize int64
// runSize is the number of tombstones (point and range) that will get flushed in
// the current run.
var runSize int64
var runByteSize int64
var runStart, runEnd roachpb.Key
// buf holds keys that we might need to issue point deletes
// for. We copy the keys using keyAlloc, truncating keyAlloc
// if we don't send the point deletes and creating a new
// keyAlloc if we do send the point deletes.
var keyAlloc bufalloc.ByteAllocator
buf := make([]roachpb.Key, 0, rangeTombstoneThreshold)
// Check for any overlapping locks, and return them to be resolved.
if locks, err := ScanLocks(
ctx, rw, startKey, endKey, maxLockConflicts, targetLockConflictBytes); err != nil {
return nil, err
} else if len(locks) > 0 {
return nil, &kvpb.LockConflictError{Locks: locks}
}
var stopRunBasedOnPredicate func(k MVCCKey, iter *MVCCIncrementalIterator) (bool, error)
if predicates.ImportEpoch > 0 {
// TODO(ssd): We will likely eventually want something
// that constructs our iterator options based on the
// predicate so that we can use a block-property
// filter for import epochs.
stopRunBasedOnPredicate = func(k MVCCKey, it *MVCCIncrementalIterator) (bool, error) {
rawV, err := it.UnsafeValue()
if err != nil {
return true, err
}
v, err := DecodeMVCCValue(rawV)
if err != nil {
return true, err
}
return v.ImportEpoch != predicates.ImportEpoch, nil
}
} else {
stopRunBasedOnPredicate = func(k MVCCKey, _ *MVCCIncrementalIterator) (bool, error) {
return k.Timestamp.LessEq(predicates.StartTime), nil
}
}
// continueRun returns three bools: the first is true if the current run
// should continue; the second is true if the latest key is a point tombstone;
// the third is true if the latest key is a range tombstone. If a non-nil
// error is returned, the booleans are invalid. The run should continue if:
//
// 1) The latest version of the key is a point or range tombstone, with a
// timestamp below the client provided EndTime. Since the goal is to create
// long runs, any tombstoned key should continue the run.
//
// 2) The latest key is live, matches the predicates, and has a
// timestamp below EndTime.
continueRun := func(k MVCCKey, iter *MVCCIncrementalIterator,
) (toContinue bool, isPointTombstone bool, isRangeTombstone bool, err error) {
// We need to see the full, unfiltered set of range keys, ignoring time
// bounds. The RangeKeysIgnoringTime() call is cheap.
hasPointKey, _ := iter.HasPointAndRange()
rangeKeys := iter.RangeKeysIgnoringTime()
hasRangeKey := !rangeKeys.IsEmpty()
if hasRangeKey {
newestRangeKey := rangeKeys.Newest()
if endTime.LessEq(newestRangeKey) {
return false, false, false, kvpb.NewWriteTooOldError(
endTime, newestRangeKey.Next(), k.Key.Clone())
}
if !hasPointKey {
// landed on bare range key.
return true, false, true, nil
}
if k.Timestamp.Less(newestRangeKey) {
// The latest range tombstone shadows the point key; ok to continue run.
return true, false, true, nil
}
}
// At this point, there exists a point key that shadows all range keys,
// if they exist.
if endTime.LessEq(k.Timestamp) {
return false, false, false, kvpb.NewWriteTooOldError(endTime, k.Timestamp.Next(),
k.Key.Clone())
}
_, isTombstone, err := iter.MVCCValueLenAndIsTombstone()
if err != nil {
return false, false, false, err
}
if isTombstone {
// The latest version of the key is a point tombstone.
return true, true, false, nil
}
// The latest key is a live point key. Conduct predicate filtering.
if stop, err := stopRunBasedOnPredicate(k, iter); err != nil {
return false, false, false, err
} else if stop {
return false, false, false, nil
}
return true, false, false, nil
}
// Create some reusable machinery for flushing a run with point tombstones
// that is typically used in a single MVCCPut call.
pointTombstoneIter, err := newMVCCIterator(
ctx, rw, endTime, false /* rangeKeyMasking */, true, /* noInterleavedIntents */
IterOptions{
KeyTypes: IterKeyTypePointsAndRanges,
Prefix: true,
ReadCategory: fs.BatchEvalReadCategory,
},
)
if err != nil {
return nil, err
}
defer pointTombstoneIter.Close()
ltScanner, err := newLockTableKeyScanner(
ctx, rw, uuid.UUID{} /* txnID */, lock.Intent,
maxLockConflicts, targetLockConflictBytes, fs.BatchEvalReadCategory,
)
if err != nil {
return nil, err
}
defer ltScanner.close()
pointTombstoneBuf := newPutBuffer()
defer pointTombstoneBuf.release()
flushDeleteKeys := func() error {
if runSize == 0 {
return nil
}
if runSize >= rangeTombstoneThreshold ||
// Even if we didn't get a large enough number of keys to switch to
// using range tombstones, the byte size of the keys we did get is now too large to
// encode them all within the byte size limit, so use a range tombstone anyway.
batchByteSize+runByteSize >= maxBatchByteSize {
if err := MVCCDeleteRangeUsingTombstone(ctx, rw, ms,
runStart, runEnd.Next(), endTime, localTimestamp, leftPeekBound, rightPeekBound,
false /* idempotent */, maxLockConflicts, targetLockConflictBytes, nil); err != nil {
return err
}
batchByteSize += int64(MVCCRangeKey{StartKey: runStart, EndKey: runEnd, Timestamp: endTime}.EncodedSize())
batchSize++
keyAlloc.Truncate()
} else {
// Use Point tombstones
for i := int64(0); i < runSize; i++ {
_, acq, err := mvccPutInternal(
ctx, rw, pointTombstoneIter, ltScanner, buf[i], endTime, noValue, pointTombstoneBuf,
nil, MVCCWriteOptions{
LocalTimestamp: localTimestamp, Stats: ms, Category: fs.BatchEvalReadCategory},
)
if err != nil {
return err
}
if !acq.Empty() {
log.Fatal(ctx, "expected empty lock acquisition for non-transactional point delete")
}
}
batchByteSize += runByteSize
batchSize += runSize
keyAlloc = bufalloc.ByteAllocator{}
}
runSize = 0
runStart = roachpb.Key{}
runEnd = roachpb.Key{}
buf = buf[:0]
return nil
}
// Using the IncrementalIterator with the time-bound iter optimization could
// potentially be a big win here -- the expected use-case for this is to run
// over an entire table's span with a very recent timestamp, issuing tombstones
// to the writes of some failed IMPORT that very likely touched only a small
// subset of the table's keyspace.
//
// The MVCCIncrementalIterator uses a non-time-bound iter as its source
// of truth, and only uses the TBI iterator as an optimization when finding
// the next KV to iterate over. This pattern allows us to quickly skip over
// swaths of uninteresting keys while still visiting the latest version of each
// MVCC key.
//
// Notice that the iterator's EndTime is set to hlc.MaxTimestamp, in order to
// detect and fail on any keys written at or after the client provided
// endTime. We don't _expect_ to hit intents or newer keys in the
// client-provided span since MVCCPredicateDeleteRange is only intended for
// non-live key spans, but there could be a leftover intent.
iter, err := NewMVCCIncrementalIterator(ctx, rw, MVCCIncrementalIterOptions{
EndKey: endKey,
StartTime: predicates.StartTime,
EndTime: hlc.MaxTimestamp,
RangeKeyMaskingBelow: endTime,
KeyTypes: IterKeyTypePointsAndRanges,
ReadCategory: fs.BatchEvalReadCategory,
})
if err != nil {
return nil, err
}
defer iter.Close()
iter.SeekGE(MVCCKey{Key: startKey})
for {
if ok, err := iter.Valid(); err != nil {
return nil, err
} else if !ok {
break
}
k := iter.UnsafeKey()
toContinue, isPointTombstone, isRangeTombstone, err := continueRun(k, iter)
if err != nil {
return nil, err
}
// If the latest version of the key is a tombstone at a timestamp < endTime,
// the timestamp could be less than predicates.StartTime. In this case, the
// run can continue; since there's no need to issue another tombstone, don't
// update runSize or buf.
if isRangeTombstone {
// Because range key information can be inferred at point keys,
// skip over the surfaced range key, and reason about shadowed keys at
// the surfaced point key.
//
// E.g. Scanning the keys below:
// 2 a2
// 1 o---o
// a b
//
// would result in two surfaced keys:
// {a-b}@1;
// a2, {a-b}@1
//
// Note that the range key gets surfaced before the point key,
// even though the point key shadows it.
iter.NextIgnoringTime()
} else if isPointTombstone {
// Since the latest version of this key is a point tombstone, skip over
// older versions of this key, and move the iterator to the next key
// even if it lies outside (startTime, endTime), to see if there's a
// need to flush.
iter.NextKeyIgnoringTime()
} else if toContinue {
// The latest version of the key is live and matches the predicate filters
// -- e.g. has a timestamp in (predicates.StartTime, endTime);
// therefore, plan to delete it.
if batchSize+runSize >= maxBatchSize || batchByteSize+runByteSize >= maxBatchByteSize {
// The matched key will be the start of the resume span.
if err := flushDeleteKeys(); err != nil {
return nil, err
}
return &roachpb.Span{Key: k.Key.Clone(), EndKey: endKey}, nil
}
if runSize == 0 {
runStart = append(runStart[:0], k.Key...)
}
runEnd = append(runEnd[:0], k.Key...)
if runSize < rangeTombstoneThreshold {
// Only buffer keys if there's a possibility of issuing point tombstones.
var keyCopy roachpb.Key
keyAlloc, keyCopy = keyAlloc.Copy(runEnd, 0)
buf = append(buf, keyCopy)
}
runSize++
runByteSize += int64(k.EncodedSize())
// Move the iterator to the next key in linear iteration even if it lies
// outside (startTime, endTime), to see if there's a need to flush. We can
// skip to the next key, as we don't care about older versions of the
// current key we're about to delete.
iter.NextKeyIgnoringTime()
} else {
// This key does not match. Flush the run of matching keys,
// to prevent issuing tombstones on keys that do not match the predicates.
if err := flushDeleteKeys(); err != nil {
return nil, err
}
// Move the incremental iterator to the next valid MVCC key that can be
// deleted. If TBI was enabled when initializing the incremental iterator,
// this step could jump over large swaths of keys that do not qualify for
// clearing.
iter.NextKey()
}
}
return nil, flushDeleteKeys()
}
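// The following sketch is illustrative only and not part of this package's
// API: it shows how a caller (e.g. an IMPORT rollback) might drive
// MVCCPredicateDeleteRange until the whole span is processed. The helper name,
// batch limits, and peek bounds are hypothetical.
func examplePredicateDeleteRangeSketch(
	ctx context.Context,
	rw ReadWriter,
	ms *enginepb.MVCCStats,
	startKey, endKey roachpb.Key,
	endTime hlc.Timestamp,
	localTS hlc.ClockTimestamp,
	importStartTime hlc.Timestamp,
) error {
	span := roachpb.Span{Key: startKey, EndKey: endKey}
	for {
		// Delete live keys with versions above importStartTime, switching from
		// point tombstones to a range tombstone for runs of 100 keys or more.
		resume, err := MVCCPredicateDeleteRange(
			ctx, rw, ms, span.Key, span.EndKey, endTime, localTS,
			nil /* leftPeekBound */, nil /* rightPeekBound */,
			kvpb.DeleteRangePredicates{StartTime: importStartTime},
			1000 /* maxBatchSize */, 1<<20 /* maxBatchByteSize */,
			100 /* rangeTombstoneThreshold */,
			0 /* maxLockConflicts */, 0 /* targetLockConflictBytes */)
		if err != nil {
			return err
		}
		if resume == nil {
			return nil
		}
		span = *resume
	}
}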
// MVCCDeleteRangeUsingTombstone deletes the given MVCC keyspan at the given
// timestamp using an MVCC range tombstone (rather than MVCC point tombstones).
// This operation is non-transactional, but will check for existing intents and
// return a LockConflictError containing up to maxLockConflicts locks. It can't
// be used across the local keyspace.
//
// The leftPeekBound and rightPeekBound parameters are used when looking for
// range tombstones that we'll merge or overlap with. These are provided to
// prevent the command from reading outside of the CRDB range bounds and latch
// bounds. nil means no bounds.
//
// If idempotent is true, the MVCC range tombstone will only be written if
// there exist any point keys/tombstones in the span that aren't already covered
// by an MVCC range tombstone. Notably, it will not write a tombstone across an
// empty span either.
//
// If msCovered is given, it must contain the current stats of the data that
// will be covered by the MVCC range tombstone. This avoids scanning across all
// point keys in the span, but will still do a time-bound scan to check for
// newer point keys that we conflict with.
//
// When deleting an entire Raft range, passing the current MVCCStats as
// msCovered and setting left/rightPeekBound to start/endKey will make the
// deletion significantly faster.
//
// TODO(sarkesian): Consider accepting MVCCWriteOptions for this function
// and its relevant callers.
func MVCCDeleteRangeUsingTombstone(
ctx context.Context,
rw ReadWriter,
ms *enginepb.MVCCStats,
startKey, endKey roachpb.Key,
timestamp hlc.Timestamp,
localTimestamp hlc.ClockTimestamp,
leftPeekBound, rightPeekBound roachpb.Key,
idempotent bool,
maxLockConflicts int64,
targetLockConflictBytes int64,
msCovered *enginepb.MVCCStats,
) error {
// Validate the range key. We must do this first, to catch e.g. any bound violations.
rangeKey := MVCCRangeKey{StartKey: startKey, EndKey: endKey, Timestamp: timestamp}
if err := rangeKey.Validate(); err != nil {
return err
}
// We currently don't allow MVCC range tombstones across the local keyspace,
// to be safe. This wouldn't handle MVCC stats (SysBytes) correctly either.
if startKey.Compare(keys.LocalMax) < 0 {
return errors.AssertionFailedf("can't write MVCC range tombstone across local keyspan %s",
rangeKey)
}
// Encode the value.
var value MVCCValue
value.LocalTimestamp = localTimestamp
if !value.LocalTimestampNeeded(timestamp) || !rw.ShouldWriteLocalTimestamps(ctx) {
value.LocalTimestamp = hlc.ClockTimestamp{}
}
if buildutil.CrdbTestBuild {
if seq, ok := kvnemesisutil.FromContext(ctx); ok {
value.KVNemesisSeq.Set(seq)
}
}
valueRaw, err := EncodeMVCCValue(value)
if err != nil {
return err
}
// Check for any overlapping locks, and return them to be resolved.
if locks, err := ScanLocks(
ctx, rw, startKey, endKey, maxLockConflicts, targetLockConflictBytes); err != nil {
return err
} else if len(locks) > 0 {
return &kvpb.LockConflictError{Locks: locks}
}
// If requested, check if there are any point keys/tombstones in the span that
// aren't already covered by MVCC range tombstones. Also check for conflicts
// with newer MVCC range tombstones.
if idempotent {
if noPointKeys, err := func() (bool, error) {
iter, err := rw.NewMVCCIterator(ctx, MVCCKeyIterKind, IterOptions{
KeyTypes: IterKeyTypePointsAndRanges,
LowerBound: startKey,
UpperBound: endKey,
RangeKeyMaskingBelow: timestamp,
ReadCategory: fs.BatchEvalReadCategory,
})
if err != nil {
return false, err
}
defer iter.Close()
for iter.SeekGE(MVCCKey{Key: startKey}); ; iter.Next() {
if ok, err := iter.Valid(); err != nil {
return false, err
} else if !ok {
break
}
if hasPoint, _ := iter.HasPointAndRange(); hasPoint {
return false, nil
} else if newest := iter.RangeKeys().Newest(); timestamp.LessEq(newest) {
return false, kvpb.NewWriteTooOldError(timestamp, newest.Next(), iter.RangeBounds().Key)
}
}
return true, nil
}(); err != nil || noPointKeys {
return err
}
}
// If we're omitting point keys in the stats/conflict scan below, we need to
// do a separate time-bound scan for point key conflicts.
//
// We can disable intent interleaving, since we've already scanned for locks.
if msCovered != nil {
if err := func() error {
iter, err := NewMVCCIncrementalIterator(ctx, rw, MVCCIncrementalIterOptions{
KeyTypes: IterKeyTypePointsOnly,
IntentPolicy: MVCCIncrementalIterIntentPolicyIgnore,
StartKey: startKey,
EndKey: endKey,
StartTime: timestamp.Prev(), // make inclusive
ReadCategory: fs.BatchEvalReadCategory,
})
if err != nil {
return err
}
defer iter.Close()
iter.SeekGE(MVCCKey{Key: startKey})
if ok, err := iter.Valid(); err != nil {
return err
} else if ok {
key := iter.UnsafeKey()
return kvpb.NewWriteTooOldError(timestamp, key.Timestamp.Next(), key.Key)
}
return nil
}(); err != nil {
return err
}
}
// Scan for conflicts and MVCC stats updates. We can omit point keys from the
// scan if stats are already known for the live data.
iterOpts := IterOptions{
KeyTypes: IterKeyTypePointsAndRanges,
LowerBound: startKey,
UpperBound: endKey,
RangeKeyMaskingBelow: timestamp, // lower point keys have already been accounted for
ReadCategory: fs.BatchEvalReadCategory,
}
if msCovered != nil {
iterOpts.KeyTypes = IterKeyTypeRangesOnly
iterOpts.RangeKeyMaskingBelow = hlc.Timestamp{}
}
iter, err := rw.NewMVCCIterator(ctx, MVCCKeyIterKind, iterOpts)
if err != nil {
return err
}
defer iter.Close()
iter.SeekGE(MVCCKey{Key: startKey})
prevRangeEnd := startKey.Clone()
for {
if ok, err := iter.Valid(); err != nil {
return err
} else if !ok {
break
}
// Process range keys.
if iter.RangeKeyChanged() {
hasPoint, hasRange := iter.HasPointAndRange()
if hasRange {
rangeKeys := iter.RangeKeys()
if timestamp.LessEq(rangeKeys.Newest()) {
return kvpb.NewWriteTooOldError(timestamp, rangeKeys.Newest().Next(),
rangeKeys.Bounds.Key)
}
if ms != nil {
// If the encountered range key does not abut the previous range key,
// we'll write a new range key fragment in the gap between them.
if !rangeKeys.Bounds.Key.Equal(prevRangeEnd) {
ms.Add(updateStatsOnRangeKeyPut(MVCCRangeKeyStack{
Bounds: roachpb.Span{Key: prevRangeEnd, EndKey: rangeKeys.Bounds.Key},
Versions: MVCCRangeKeyVersions{{Timestamp: timestamp, Value: valueRaw}},
}))
}
ms.Add(updateStatsOnRangeKeyPutVersion(rangeKeys,
MVCCRangeKeyVersion{Timestamp: timestamp, Value: valueRaw}))
}
prevRangeEnd = append(prevRangeEnd[:0], rangeKeys.Bounds.EndKey...)
}
// If we hit a bare range key, it's possible that there's a point key on the
// same key as its start key, so take a normal step to look for it.
if !hasPoint {
iter.Next()
continue
}
}
// Process point key.
key := iter.UnsafeKey()
if timestamp.LessEq(key.Timestamp) {
return kvpb.NewWriteTooOldError(timestamp, key.Timestamp.Next(), key.Key)
}
if key.Timestamp.IsEmpty() {
return errors.Errorf("can't write range tombstone across inline key %s", key)
}
if ms != nil {
valueLen, isTombstone, err := iter.MVCCValueLenAndIsTombstone()
if err != nil {
return err
}
ms.Add(updateStatsOnRangeKeyCover(timestamp, key, valueLen, isTombstone))
}
iter.NextKey()
}
// Once we've iterated across the range key span, fill in the final gap
// between the previous existing range key fragment and the end of the range
// key if any. If no existing fragments were found during iteration above,
// this will be the entire new range key.
if ms != nil && !prevRangeEnd.Equal(endKey) {
ms.Add(updateStatsOnRangeKeyPut(MVCCRangeKeyStack{
Bounds: roachpb.Span{Key: prevRangeEnd, EndKey: endKey},
Versions: MVCCRangeKeyVersions{{Timestamp: timestamp, Value: valueRaw}},
}))
}
// Check if the range key will merge with or fragment any existing range keys
// at the bounds, and adjust stats accordingly.
if ms != nil && (!leftPeekBound.Equal(startKey) || !rightPeekBound.Equal(endKey)) {
if rightPeekBound == nil {
rightPeekBound = keys.MaxKey
}
rkIter, err := rw.NewMVCCIterator(ctx, MVCCKeyIterKind, IterOptions{
KeyTypes: IterKeyTypeRangesOnly,
LowerBound: leftPeekBound,
UpperBound: rightPeekBound,
ReadCategory: fs.BatchEvalReadCategory,
})
if err != nil {
return err
}
defer rkIter.Close()
// Peek to the left.
if cmp, lhs, err := PeekRangeKeysLeft(rkIter, startKey); err != nil {
return err
} else if cmp > 0 {
// We're fragmenting an existing range key.
ms.Add(UpdateStatsOnRangeKeySplit(startKey, lhs.Versions))
} else if cmp == 0 {
// We may be merging with an existing range key to the left, possibly
// along with an existing stack below us.
lhs = lhs.Clone()
rhs := rangeKey.AsStack(valueRaw)
if cmp, below, err := PeekRangeKeysRight(rkIter, startKey); err != nil {
return err
} else if cmp == 0 {
rhs.Versions = append(rhs.Versions, below.Versions...)
}
if lhs.CanMergeRight(rhs) {
ms.Add(updateStatsOnRangeKeyMerge(startKey, rhs.Versions))
}
}
// Peek to the right.
if cmp, rhs, err := PeekRangeKeysRight(rkIter, endKey); err != nil {
return err
} else if cmp < 0 {
// We're fragmenting an existing range key.
ms.Add(UpdateStatsOnRangeKeySplit(endKey, rhs.Versions))
} else if cmp == 0 {
// We may be merging with an existing range key to the right, possibly
// along with an existing stack below us.
lhs := rangeKey.AsStack(valueRaw)
rhs = rhs.Clone()
if cmp, below, err := PeekRangeKeysLeft(rkIter, endKey); err != nil {
return err
} else if cmp == 0 {
lhs.Versions = append(lhs.Versions, below.Versions...)
}
if lhs.CanMergeRight(rhs) {
ms.Add(updateStatsOnRangeKeyMerge(endKey, rhs.Versions))
}
}
}
// If we're given MVCC stats for the covered data, mark it as deleted at the
// current timestamp.
if ms != nil && msCovered != nil {
ms.Add(updateStatsOnRangeKeyCoverStats(timestamp, *msCovered))
}
if err := rw.PutMVCCRangeKey(rangeKey, value); err != nil {
return err
}
rw.LogLogicalOp(MVCCDeleteRangeOpType, MVCCLogicalOpDetails{
Safe: true,
Key: rangeKey.StartKey,
EndKey: rangeKey.EndKey,
Timestamp: rangeKey.Timestamp,
})
return nil
}
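// The following sketch is illustrative only and not part of this package's
// API: it shows a typical non-idempotent call that lays a single MVCC range
// tombstone over a span, using the span bounds as the peek bounds so the
// fragmentation/merge checks stay within the caller's latches. The helper name
// is hypothetical.
func exampleDeleteRangeUsingTombstoneSketch(
	ctx context.Context,
	rw ReadWriter,
	ms *enginepb.MVCCStats,
	startKey, endKey roachpb.Key,
	ts hlc.Timestamp,
	localTS hlc.ClockTimestamp,
) error {
	return MVCCDeleteRangeUsingTombstone(
		ctx, rw, ms, startKey, endKey, ts, localTS,
		startKey /* leftPeekBound */, endKey /* rightPeekBound */,
		false /* idempotent */,
		0 /* maxLockConflicts */, 0 /* targetLockConflictBytes */,
		nil /* msCovered */)
}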
type iteratorWithStats interface {
// Stats returns statistics about the iterator.
Stats() IteratorStats
}
// recordIteratorStats updates the provided ScanStats (which is assumed to be
// non-nil) with the MVCC stats from iter.
func recordIteratorStats(iter iteratorWithStats, scanStats *kvpb.ScanStats) {
iteratorStats := iter.Stats()
stats := &iteratorStats.Stats
steps := stats.ReverseStepCount[pebble.InterfaceCall] + stats.ForwardStepCount[pebble.InterfaceCall]
seeks := stats.ReverseSeekCount[pebble.InterfaceCall] + stats.ForwardSeekCount[pebble.InterfaceCall]
internalSteps := stats.ReverseStepCount[pebble.InternalIterCall] + stats.ForwardStepCount[pebble.InternalIterCall]
internalSeeks := stats.ReverseSeekCount[pebble.InternalIterCall] + stats.ForwardSeekCount[pebble.InternalIterCall]
scanStats.NumInterfaceSeeks += uint64(seeks)
scanStats.NumInternalSeeks += uint64(internalSeeks)
scanStats.NumInterfaceSteps += uint64(steps)
scanStats.NumInternalSteps += uint64(internalSteps)
scanStats.BlockBytes += stats.InternalStats.BlockBytes
scanStats.BlockBytesInCache += stats.InternalStats.BlockBytesInCache
scanStats.KeyBytes += stats.InternalStats.KeyBytes
scanStats.ValueBytes += stats.InternalStats.ValueBytes
scanStats.PointCount += stats.InternalStats.PointCount
scanStats.PointsCoveredByRangeTombstones += stats.InternalStats.PointsCoveredByRangeTombstones
scanStats.RangeKeyCount += uint64(stats.RangeKeyStats.Count)
scanStats.RangeKeyContainedPoints += uint64(stats.RangeKeyStats.ContainedPoints)
scanStats.RangeKeySkippedPoints += uint64(stats.RangeKeyStats.SkippedPoints)
scanStats.SeparatedPointCount += stats.InternalStats.SeparatedPointValue.Count
scanStats.SeparatedPointValueBytes += stats.InternalStats.SeparatedPointValue.ValueBytes
scanStats.SeparatedPointValueBytesFetched += stats.InternalStats.SeparatedPointValue.ValueBytesFetched
scanStats.BlockReadDuration += stats.InternalStats.BlockReadDuration
}
// mvccScanInit performs some preliminary checks on the validity of options for
// a scan.
//
// If ok=true is returned, then the pebbleMVCCScanner must be release()'d when
// no longer needed. The scanner is initialized with the given results.
//
// If ok=false is returned, then the returned result and the error are the
// result of the scan.
func mvccScanInit(
mvccScanner *pebbleMVCCScanner,
iter MVCCIterator,
key, endKey roachpb.Key,
timestamp hlc.Timestamp,
opts MVCCScanOptions,
results results,
) (ok bool, _ MVCCScanResult, _ error) {
if len(endKey) == 0 {
return false, MVCCScanResult{}, emptyKeyError()
}
if err := opts.validate(); err != nil {
return false, MVCCScanResult{}, err
}
if opts.MaxKeys < 0 {
return false, MVCCScanResult{
ResumeSpan: &roachpb.Span{Key: key, EndKey: endKey},
ResumeReason: kvpb.RESUME_KEY_LIMIT,
}, nil
}
if opts.TargetBytes < 0 {
return false, MVCCScanResult{
ResumeSpan: &roachpb.Span{Key: key, EndKey: endKey},
ResumeReason: kvpb.RESUME_BYTE_LIMIT,
}, nil
}
memAccount := mvccScanner.memAccount
if opts.MemoryAccount != nil {
memAccount = opts.MemoryAccount
}
*mvccScanner = pebbleMVCCScanner{
parent: iter,
memAccount: memAccount,
unlimitedMemAcc: mvccScanner.unlimitedMemAcc,
lockTable: opts.LockTable,
reverse: opts.Reverse,
start: key,
end: endKey,
ts: timestamp,
maxKeys: opts.MaxKeys,
targetBytes: opts.TargetBytes,
allowEmpty: opts.AllowEmpty,
rawMVCCValues: opts.ReturnRawMVCCValues,
wholeRows: opts.WholeRowsOfSize > 1, // single-KV rows don't need processing
maxLockConflicts: opts.MaxLockConflicts,
inconsistent: opts.Inconsistent,
skipLocked: opts.SkipLocked,
tombstones: opts.Tombstones,
failOnMoreRecent: opts.FailOnMoreRecent,
keyBuf: mvccScanner.keyBuf,
// NB: If the `results` argument passed to this function is a pointer to
// mvccScanner.alloc.pebbleResults, we don't want to overwrite any
// initialization of the pebbleResults struct performed by the caller.
// The struct should not contain any stale buffers from previous uses,
// because pebbleMVCCScanner.release zeros it.
alloc: mvccScanner.alloc,
}
mvccScanner.init(opts.Txn, opts.Uncertainty, results)
return true /* ok */, MVCCScanResult{}, nil
}
func mvccScanToBytes(
ctx context.Context,
iter MVCCIterator,
key, endKey roachpb.Key,
timestamp hlc.Timestamp,
opts MVCCScanOptions,
) (MVCCScanResult, error) {
mvccScanner := pebbleMVCCScannerPool.Get().(*pebbleMVCCScanner)
results := &mvccScanner.alloc.pebbleResults
*results = pebbleResults{}
if opts.WholeRowsOfSize > 1 {
results.lastOffsetsEnabled = true
results.lastOffsets = make([]int, opts.WholeRowsOfSize)
}
ok, res, err := mvccScanInit(mvccScanner, iter, key, endKey, timestamp, opts, results)
if !ok {
return res, err
}
defer mvccScanner.release()
res.ResumeSpan, res.ResumeReason, res.ResumeNextBytes, err = mvccScanner.scan(ctx)
if err != nil {
return MVCCScanResult{}, err
}
res.KVData = results.finish()
if err = finalizeScanResult(mvccScanner, &res, opts); err != nil {
return MVCCScanResult{}, err
}
return res, nil
}
// finalizeScanResult updates the MVCCScanResult in-place after the scan was
// completed successfully. It also performs some additional auxiliary tasks
// (like recording iterators stats).
func finalizeScanResult(
mvccScanner *pebbleMVCCScanner, res *MVCCScanResult, opts MVCCScanOptions,
) error {
res.NumKeys, res.NumBytes, _ = mvccScanner.results.sizeInfo(0 /* lenKey */, 0 /* lenValue */)
// If we're tracking the ScanStats, include the stats from this Scan /
// ReverseScan.
if opts.ScanStats != nil {
recordIteratorStats(mvccScanner.parent, opts.ScanStats)
if opts.Reverse {
opts.ScanStats.NumReverseScans++
} else {
opts.ScanStats.NumScans++
}
}
var err error
res.Intents, err = buildScanIntents(mvccScanner.intentsRepr())
if err != nil {
return err
}
if opts.errOnIntents() && len(res.Intents) > 0 {
return &kvpb.LockConflictError{Locks: roachpb.AsLocks(res.Intents)}
}
return nil
}
// mvccScanToKvs converts the raw key/value pairs returned by MVCCIterator.MVCCScan
// into a slice of roachpb.KeyValues.
func mvccScanToKvs(
ctx context.Context,
iter MVCCIterator,
key, endKey roachpb.Key,
timestamp hlc.Timestamp,
opts MVCCScanOptions,
) (MVCCScanResult, error) {
res, err := mvccScanToBytes(ctx, iter, key, endKey, timestamp, opts)
if err != nil {
return MVCCScanResult{}, err
}
res.KVs = make([]roachpb.KeyValue, res.NumKeys)
kvData := res.KVData
res.KVData = nil
var i int
if err := MVCCScanDecodeKeyValues(kvData, func(key MVCCKey, rawBytes []byte) error {
res.KVs[i].Key = key.Key
res.KVs[i].Value.RawBytes = rawBytes
res.KVs[i].Value.Timestamp = key.Timestamp
i++
return nil
}); err != nil {
return MVCCScanResult{}, err
}
return res, err
}
func buildScanIntents(data []byte) ([]roachpb.Intent, error) {
if len(data) == 0 {
return nil, nil
}
reader, err := NewBatchReader(data)
if err != nil {
return nil, err
}
intents := make([]roachpb.Intent, 0, reader.Count())
var meta enginepb.MVCCMetadata
for reader.Next() {
key, err := reader.MVCCKey()
if err != nil {
return nil, err
}
if err := protoutil.Unmarshal(reader.Value(), &meta); err != nil {
return nil, err
}
if meta.Txn == nil {
return nil, errors.AssertionFailedf("unexpected nil MVCCMetadata.Txn: %v", meta)
}
intents = append(intents, roachpb.MakeIntent(meta.Txn, key.Key))
}
if err := reader.Error(); err != nil {
return nil, err
}
return intents, nil
}
// MVCCWriteOptions bundles options for the MVCCPut and MVCCDelete families of
// functions.
type MVCCWriteOptions struct {
// See the comment on mvccPutInternal for details on these parameters.
Txn *roachpb.Transaction
LocalTimestamp hlc.ClockTimestamp
Stats *enginepb.MVCCStats
ReplayWriteTimestampProtection bool
OmitInRangefeeds bool
ImportEpoch uint32
// OriginID, when set during Logical Data Replication, is written to the
// MVCCValueHeader of the key being put.
OriginID uint32
// OriginTimestamp, when set during Logical Data Replication, is written to the
// MVCCValueHeader of the key being put.
OriginTimestamp hlc.Timestamp
// MaxLockConflicts is a maximum number of conflicting locks collected before
// returning LockConflictError. Even single-key writes can encounter multiple
// conflicting shared locks, so the limit is important to bound the number of
// locks returned.
//
// The zero value indicates no limit.
MaxLockConflicts int64
// TargetLockConflictBytes is the target number of bytes returned in a
// LockConflictError. The process will stop collecting intents once the total
// size exceeds this threshold.
//
// The zero value indicates no limit.
TargetLockConflictBytes int64
// Category is used for writes that need to do a read.
Category fs.ReadCategory
}
func (opts *MVCCWriteOptions) validate() error {
if opts.ReplayWriteTimestampProtection && opts.Txn == nil {
return errors.Errorf("cannot enable replay protection without a transaction")
}
return nil
}
// TxnID returns the transaction ID if the write corresponds to a transactional
// write. Otherwise, if it corresponds to a non-transactional write, an empty ID
// is returned.
func (opts *MVCCWriteOptions) TxnID() uuid.UUID {
if opts.Txn != nil {
return opts.Txn.ID
}
return uuid.UUID{}
}
// MVCCScanOptions bundles options for the MVCCScan family of functions.
type MVCCScanOptions struct {
// See the documentation for MVCCScan for information on these parameters.
Inconsistent bool
SkipLocked bool
Tombstones bool
Reverse bool
FailOnMoreRecent bool
Txn *roachpb.Transaction
ScanStats *kvpb.ScanStats
Uncertainty uncertainty.Interval
// MaxKeys is the maximum number of kv pairs returned from this operation.
// The zero value represents an unbounded scan. If the limit stops the scan,
// a corresponding ResumeSpan is returned. As a special case, the value -1
// returns no keys in the result (returning the first key via the
// ResumeSpan).
MaxKeys int64
// TargetBytes is a byte threshold to limit the amount of data pulled into
// memory during a Scan operation. Once the target is satisfied (i.e. met or
// exceeded) by the emitted KV pairs, iteration stops (with a ResumeSpan as
// appropriate). In particular, at least one kv pair is returned (when one
// exists), unless AllowEmpty is set.
//
// The number of bytes a particular kv pair accrues depends on internal data
// structures, but it is guaranteed to exceed that of the bytes stored in
// the key and value itself.
//
// The zero value indicates no limit.
TargetBytes int64
// AllowEmpty will return an empty result if the first kv pair exceeds the
// TargetBytes limit.
AllowEmpty bool
// WholeRowsOfSize will prevent returning partial rows when limits (MaxKeys or
// TargetBytes) are set. The value indicates the max number of keys per row.
// If the last KV pair(s) belong to a partial row, they will be removed from
// the result -- except if the result only consists of a single partial row
// and AllowEmpty is false, in which case the remaining KV pairs of the row
// will be fetched and returned too.
WholeRowsOfSize int32
// MaxLockConflicts is the maximum number of locks (intents) collected by the
// scanner in consistent mode before returning a LockConflictError.
//
// Not used in inconsistent scans.
// The zero value indicates no limit.
MaxLockConflicts int64
// TargetLockConflictBytes sets a target byte size for the intents collected
// into a LockConflictError. Intent collection stops once the total intent size
// exceeds this threshold. This setting only works under
// MVCCIncrementalIterIntentPolicyAggregate. The caller must call
// TryGetIntentError even when the total collected intent size is less than the
// threshold.
//
// The zero value indicates no limit.
TargetLockConflictBytes int64
// MemoryAccount is used for tracking memory allocations.
MemoryAccount *mon.BoundAccount
// LockTable is used to determine whether keys are locked in the in-memory
// lock table when scanning with the SkipLocked option.
LockTable LockTableView
// DontInterleaveIntents, when set, makes it such that intent metadata is not
// interleaved with the results of the scan. Setting this option means that
// the underlying pebble iterator will only scan over the MVCC keyspace and
// will not use an `intentInterleavingIter`. It is only appropriate to use
// this when the caller does not need to know whether a given key is an intent
// or not. It is usually set by read-only requests that have resolved their
// conflicts before they begin their MVCC scan.
DontInterleaveIntents bool
// ReadCategory is used to map to a user-understandable category string, for
// stats aggregation and metrics, and a Pebble-understandable QoS.
ReadCategory fs.ReadCategory
// ReturnRawMVCCValues indicates that the scan should return
// roachpb.Value whose RawBytes may contain MVCCValueHeader
// data.
ReturnRawMVCCValues bool
}
func (opts *MVCCScanOptions) validate() error {
if opts.Inconsistent && opts.Txn != nil {
return errors.Errorf("cannot allow inconsistent reads within a transaction")
}
if opts.Inconsistent && opts.SkipLocked {
return errors.Errorf("cannot allow inconsistent reads with skip locked option")
}
if opts.Inconsistent && opts.FailOnMoreRecent {
return errors.Errorf("cannot allow inconsistent reads with fail on more recent option")
}
if opts.DontInterleaveIntents && opts.SkipLocked {
return errors.Errorf("cannot disable interleaved intents with skip locked option")
}
return nil
}
func (opts *MVCCScanOptions) errOnIntents() bool {
return !opts.Inconsistent && !opts.SkipLocked
}
// MVCCScanResult groups the values returned from an MVCCScan operation.
// Depending on the operation invoked, only one of KVData, ColBatches, or KVs is
// populated.
type MVCCScanResult struct {
KVData [][]byte
ColBatches []coldata.Batch
KVs []roachpb.KeyValue
NumKeys int64
// NumBytes is the number of bytes this scan result accrued in terms of the
// MVCCScanOptions.TargetBytes parameter. This roughly measures the bytes
// used for encoding the uncompressed kv pairs contained in the result.
NumBytes int64
ResumeSpan *roachpb.Span
ResumeReason kvpb.ResumeReason
ResumeNextBytes int64 // populated if TargetBytes != 0, size of next resume kv
Intents []roachpb.Intent
}
// MVCCScan scans the key range [key, endKey) in the provided reader up to some
// maximum number of results in ascending order. If it hits max, it returns a
// "resume span" to be used in the next call to this function. If the limit is
// not hit, the resume span will be nil. Otherwise, it will be the sub-span of
// [key, endKey) that has not been scanned.
//
// For an unbounded scan, specify a max of zero.
//
// Only keys with a timestamp less than or equal to the supplied timestamp
// will be included in the scan results. If a transaction is provided and the
// scan encounters a value with a timestamp between the supplied timestamp and
// the transaction's global uncertainty limit, an uncertainty error will be
// returned. This window of uncertainty is reduced down to the local uncertainty
// limit, if one is provided.
//
// In tombstones mode, if the most recent value for a key is a deletion
// tombstone, the scan result will contain a roachpb.KeyValue for that key whose
// RawBytes field is nil. Otherwise, the key-value pair will be omitted from the
// result entirely. MVCC range tombstones will be emitted as synthetic point
// tombstones above existing point keys, but not below them and not if they
// don't overlap any point keys at all. This is unlike MVCCGet, which will
// always synthesize point tombstones if the key overlaps a range tombstone,
// regardless of whether a point key exists below it.
//
// When scanning inconsistently, any encountered intents will be placed in the
// dedicated result parameter. By contrast, when scanning consistently, any
// encountered intents will cause the scan to return a LockConflictError with the
// intents embedded within.
//
// Note that transactional scans must be consistent. Put another way, only
// non-transactional scans may be inconsistent.
//
// When scanning in "skip locked" mode, keys that are locked by transactions
// other than the reader are not included in the result set and do not result in
// a LockConflictError. Instead, these keys are included in the encountered
// intents result parameter so that they can be resolved asynchronously. In this
// mode, the LockTableView provided in the options is consulted for each key to
// determine whether it is locked with an unreplicated lock.
//
// When scanning in "fail on more recent" mode, a WriteTooOldError will be
// returned if the scan observes a version with a timestamp at or above the read
// timestamp. If the scan observes multiple versions with timestamp at or above
// the read timestamp, the maximum will be returned in the WriteTooOldError.
// Similarly, a LockConflictError will be returned if the scan observes another
// transaction's intent, even if it has a timestamp above the read timestamp.
func MVCCScan(
ctx context.Context,
reader Reader,
key, endKey roachpb.Key,
timestamp hlc.Timestamp,
opts MVCCScanOptions,
) (MVCCScanResult, error) {
iter, err := newMVCCIterator(
ctx, reader, timestamp, !opts.Tombstones, opts.DontInterleaveIntents, IterOptions{
KeyTypes: IterKeyTypePointsAndRanges,
LowerBound: key,
UpperBound: endKey,
ReadCategory: opts.ReadCategory,
},
)
if err != nil {
return MVCCScanResult{}, err
}
defer iter.Close()
return mvccScanToKvs(ctx, iter, key, endKey, timestamp, opts)
}
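// The following sketch is illustrative only and not part of this package's
// API: it shows how a caller might paginate a consistent scan with MaxKeys and
// resume from the returned span. The helper name and key limit are
// hypothetical.
func exampleScanWithResumeSpanSketch(
	ctx context.Context, reader Reader, key, endKey roachpb.Key, ts hlc.Timestamp,
) ([]roachpb.KeyValue, error) {
	var kvs []roachpb.KeyValue
	for {
		res, err := MVCCScan(ctx, reader, key, endKey, ts, MVCCScanOptions{MaxKeys: 100})
		if err != nil {
			return nil, err
		}
		kvs = append(kvs, res.KVs...)
		if res.ResumeSpan == nil {
			// The limit was not hit; the scan is complete.
			return kvs, nil
		}
		// Resume the forward scan from the first unscanned key.
		key = res.ResumeSpan.Key
	}
}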
// MVCCScanToBytes is like MVCCScan, but it returns the results in a byte array.
func MVCCScanToBytes(
ctx context.Context,
reader Reader,
key, endKey roachpb.Key,
timestamp hlc.Timestamp,
opts MVCCScanOptions,
) (MVCCScanResult, error) {
iter, err := newMVCCIterator(
ctx, reader, timestamp, !opts.Tombstones, opts.DontInterleaveIntents, IterOptions{
KeyTypes: IterKeyTypePointsAndRanges,
LowerBound: key,
UpperBound: endKey,
ReadCategory: opts.ReadCategory,
},
)
if err != nil {
return MVCCScanResult{}, err
}
defer iter.Close()
return mvccScanToBytes(ctx, iter, key, endKey, timestamp, opts)
}
// MVCCScanAsTxn constructs a temporary transaction from the given transaction
// metadata and calls MVCCScan as that transaction. This method is required only
// for reading intents of a transaction when only its metadata is known and
// should rarely be used.
//
// The read is carried out without the chance of uncertainty restarts.
func MVCCScanAsTxn(
ctx context.Context,
reader Reader,
key, endKey roachpb.Key,
timestamp hlc.Timestamp,
txnMeta enginepb.TxnMeta,
) (MVCCScanResult, error) {
return MVCCScan(ctx, reader, key, endKey, timestamp, MVCCScanOptions{
Txn: &roachpb.Transaction{
TxnMeta: txnMeta,
Status: roachpb.PENDING,
ReadTimestamp: txnMeta.WriteTimestamp,
GlobalUncertaintyLimit: txnMeta.WriteTimestamp,
}})
}
// MVCCIterate iterates over the key range [start,end). At each step of the
// iteration, f() is invoked with the current key/value pair. If f returns
// iterutil.StopIteration, the iteration stops with no error propagated. If f
// returns any other error, the iteration stops and the error is propagated. If
// the reverse flag is set, the iterator will be moved in reverse order. If the
// scan options specify an inconsistent scan, all "ignored" intents will be
// returned. In consistent mode, intents are only ever returned as part of a
// LockConflictError. In Tombstones mode, MVCC range tombstones are emitted as
// synthetic point tombstones above existing point keys.
func MVCCIterate(
ctx context.Context,
reader Reader,
key, endKey roachpb.Key,
timestamp hlc.Timestamp,
opts MVCCScanOptions,
f func(roachpb.KeyValue) error,
) ([]roachpb.Intent, error) {
iter, err := newMVCCIterator(
ctx, reader, timestamp, !opts.Tombstones, opts.DontInterleaveIntents, IterOptions{
KeyTypes: IterKeyTypePointsAndRanges,
LowerBound: key,
UpperBound: endKey,
ReadCategory: opts.ReadCategory,
},
)
if err != nil {
return nil, err
}
defer iter.Close()
var intents []roachpb.Intent
for {
const maxKeysPerScan = 1000
opts := opts
opts.MaxKeys = maxKeysPerScan
res, err := mvccScanToKvs(
ctx, iter, key, endKey, timestamp, opts)
if err != nil {
return nil, err
}
if len(res.Intents) > 0 {
if intents == nil {
intents = res.Intents
} else {
intents = append(intents, res.Intents...)
}
}
for i := range res.KVs {
if err := f(res.KVs[i]); err != nil {
err = iterutil.Map(err)
return intents, err
}
}
if res.ResumeSpan == nil {
break
}
if opts.Reverse {
endKey = res.ResumeSpan.EndKey
} else {
key = res.ResumeSpan.Key
}
}
return intents, nil
}
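// The following sketch is illustrative only and not part of this package's
// API: it shows how a caller might use MVCCIterate with a callback that stops
// early, assuming iterutil.StopIteration() is the sentinel referenced in the
// comment above. The helper name and limit parameter are hypothetical.
func exampleIterateWithEarlyStopSketch(
	ctx context.Context, reader Reader, key, endKey roachpb.Key, ts hlc.Timestamp, limit int,
) (int, error) {
	count := 0
	_, err := MVCCIterate(ctx, reader, key, endKey, ts, MVCCScanOptions{},
		func(kv roachpb.KeyValue) error {
			count++
			if count >= limit {
				// Stop iterating without propagating an error to the caller.
				return iterutil.StopIteration()
			}
			return nil
		})
	return count, err
}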
// MVCCPaginate iteratively invokes f() with the current maxKeys and
// targetBytes limits. If f returns iterutil.StopIteration (meaning that we
// have iterated through all elements), the iteration stops with no error
// propagated. If f returns any other error, the iteration stops and the error
// is propagated. If the number of keys hits the maxKeys limit or the number of
// bytes hits the targetBytes limit, the iteration stops with no error
// propagated but with the appropriate resume reason returned. f returns a
// resumeReason which, if set, asserts that the number of keys / bytes hit the
// key / byte limit matching the resumeReason. Moreover, if resumeReason is
// RESUME_BYTE_LIMIT and allowEmpty is true, then the iteration stops with no
// error propagated but with the RESUME_BYTE_LIMIT resume reason returned.
//
// We note that it is up to f() whether it wants to allow the numBytes to
// exceed targetBytes by up to one entry or whether it wants to terminate
// iteration before numBytes exceeds targetBytes. See the AllowEmpty option.
func MVCCPaginate(
ctx context.Context,
maxKeys, targetBytes int64,
allowEmpty bool,
f func(maxKeys, targetBytes int64) (numKeys, numBytes int64, resumeReason kvpb.ResumeReason, err error),
) (numKeys, numBytes int64, resumeReason kvpb.ResumeReason, err error) {
for {
if maxKeys < 0 {
return numKeys, numBytes, kvpb.RESUME_KEY_LIMIT, nil
}
if targetBytes < 0 {
return numKeys, numBytes, kvpb.RESUME_BYTE_LIMIT, nil
}
addedKeys, addedBytes, resumeReason, err := f(maxKeys, targetBytes)
if err != nil {
if addedKeys != 0 || addedBytes != 0 || resumeReason != 0 {
log.Fatalf(ctx,
"addedKeys, addedBytes, and resumeReason should all be 0, but got addedKeys=%d, addedBytes=%d, resumeReason=%d",
addedKeys, addedBytes, resumeReason)
}
err = iterutil.Map(err)
return numKeys, numBytes, 0, err
}
numKeys += addedKeys
numBytes += addedBytes
if maxKeys > 0 {
if addedKeys > maxKeys {
log.Fatalf(ctx, "added %d keys, which exceeds the max key limit %d", addedKeys, maxKeys)
} else if addedKeys < maxKeys {
maxKeys -= addedKeys
} else {
maxKeys = -1
}
}
if targetBytes > 0 {
if addedBytes < targetBytes {
targetBytes -= addedBytes
} else {
targetBytes = -1
}
}
switch resumeReason {
case kvpb.RESUME_KEY_LIMIT:
if maxKeys >= 0 {
log.Fatalf(ctx, "Resume reason RESUME_KEY_LIMIT, but key limit = %d has not been hit", maxKeys)
}
case kvpb.RESUME_BYTE_LIMIT:
if !allowEmpty && targetBytes >= 0 {
log.Fatalf(ctx, "Resume reason RESUME_BYTE_LIMIT, but byte limit = %d has not been hit", targetBytes)
}
targetBytes = -1
case 0:
default:
log.Fatalf(ctx, "Resume reason must be RESUME_KEY_LIMIT, RESUME_BYTE_LIMIT, or 0, got resumeReason = %d", resumeReason)
}
}
}
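// The following sketch is illustrative only and not part of this package's
// API: it shows one way a caller might drive repeated MVCCScan calls under the
// MVCCPaginate protocol, forwarding the per-call key/byte consumption and
// resume reason. The helper name is hypothetical, and iterutil.StopIteration()
// is assumed to be the sentinel referenced in the comment above.
func examplePaginateDrivenScanSketch(
	ctx context.Context,
	reader Reader,
	key, endKey roachpb.Key,
	ts hlc.Timestamp,
	maxKeys, targetBytes int64,
) (int64, int64, kvpb.ResumeReason, error) {
	var done bool
	return MVCCPaginate(ctx, maxKeys, targetBytes, false /* allowEmpty */,
		func(maxKeys, targetBytes int64) (int64, int64, kvpb.ResumeReason, error) {
			if done {
				// Nothing left to scan; stop with no error propagated.
				return 0, 0, 0, iterutil.StopIteration()
			}
			res, err := MVCCScan(ctx, reader, key, endKey, ts, MVCCScanOptions{
				MaxKeys:     maxKeys,
				TargetBytes: targetBytes,
			})
			if err != nil {
				return 0, 0, 0, err
			}
			if res.ResumeSpan == nil {
				done = true
			} else {
				// Resume the next call from the first unscanned key.
				key = res.ResumeSpan.Key
			}
			return res.NumKeys, res.NumBytes, res.ResumeReason, nil
		})
}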
// MVCCResolveWriteIntent either commits, aborts (rolls back), or moves forward
// in time an extant write intent for a given txn according to commit
// parameter. ResolveWriteIntent will skip write intents of other txns.
//
// An opts.TargetBytes < 0 means resolve nothing and return the intent as the
// resume span. If opts.TargetBytes >= 0, the intent is resolved and the resume
// span is nil.
//
// Returns whether or not an intent was found to resolve, number of bytes added
// to the write batch by intent resolution, and the resume span if the max
// bytes limit was exceeded. Additionally, if any replicated locks with strength
// lock.Shared or lock.Exclusive are released, a boolean indicating as such is
// also returned.
//
// Transaction epochs deserve a bit of explanation. The epoch for a
// transaction is incremented on transaction retries. A transaction
// retry is different from an abort. Retries can occur in SSI
// transactions when the commit timestamp is not equal to the proposed
// transaction timestamp. On a retry, the epoch is incremented instead
// of creating an entirely new transaction. This allows the intents
// that were written on previous runs to serve as locks which prevent
// concurrent reads from further incrementing the timestamp cache,
// making further transaction retries less likely.
//
// Because successive retries of a transaction may end up writing to
// different keys, the epochs serve to classify which intents get
// committed in the event the transaction succeeds (all those with
// epoch matching the commit epoch), and which intents get aborted,
// even if the transaction succeeds.
func MVCCResolveWriteIntent(
ctx context.Context,
rw ReadWriter,
ms *enginepb.MVCCStats,
update roachpb.LockUpdate,
opts MVCCResolveWriteIntentOptions,
) (ok bool, numBytes int64, resumeSpan *roachpb.Span, replLocksReleased bool, err error) {
if len(update.Key) == 0 {
return false, 0, nil, false, emptyKeyError()
}
if len(update.EndKey) > 0 {
return false, 0, nil, false, errors.Errorf("can't resolve range intent as point intent")
}
if opts.TargetBytes < 0 {
return false, 0, &roachpb.Span{Key: update.Key}, false, nil
}
// Production code will use a buffered writer, which makes the numBytes
// calculation accurate. Note that an inaccurate numBytes (e.g. 0 in the
// case of an unbuffered writer) does not affect any safety properties of
// the database.
beforeBytes := rw.BufferedSize()
// Iterate over all locks held by update.Txn on this key.
ltIter, err := NewLockTableIterator(ctx, rw, LockTableIteratorOptions{
Prefix: true,
MatchTxnID: update.Txn.ID,
ReadCategory: fs.IntentResolutionReadCategory,
})
if err != nil {
return false, 0, nil, false, err
}
defer ltIter.Close()
buf := newPutBuffer()
defer buf.release()
var ltSeekKey EngineKey
ltSeekKey, buf.ltKeyBuf = LockTableKey{
Key: update.Key,
// lock.Intent is the first locking strength in the lock-table. As a
// minor performance optimization, we seek to this version and iterate
// instead of iterating from the beginning of the version prefix (i.e.
// keys.LockTableSingleKey(update.Key)). This can seek past half of the
// LSM tombstones on this key in cases like those described in d1c91e0e
// where intents are repeatedly written and removed on a specific key so
// an intent is surrounded by a large number of tombstones during its
// resolution.
//
// This isn't a full solution to this problem, because we still end up
// iterating through the other half of the LSM tombstones while checking
// for Exclusive and Shared locks. For a full solution, we need to track
// the locking strengths that we intend to resolve on the client so that
// we can seek to just those versions.
//
// We could also seek to all three versions (Intent, Exclusive, Shared)
// with a limit, but that would require 3 seeks in all cases instead of
// a single seek and step in cases where only an intent is present. We
// chose not to pessimize the common case to optimize the uncommon case.
Strength: lock.Intent,
TxnUUID: update.Txn.ID,
}.ToEngineKey(buf.ltKeyBuf)
for valid, err := ltIter.SeekEngineKeyGE(ltSeekKey); ; valid, err = ltIter.NextEngineKey() {
if err != nil {
return false, 0, nil, false, errors.Wrap(err, "seeking lock table")
} else if !valid {
break
}
str, txnID, err := ltIter.LockTableKeyVersion()
if err != nil {
return false, 0, nil, false, errors.Wrap(err, "decoding lock table key version")
}
if txnID != update.Txn.ID {
return false, 0, nil, false, errors.AssertionFailedf(
"unexpected txnID %v != %v while scanning lock table", txnID, update.Txn.ID)
}
if err := ltIter.ValueProto(&buf.meta); err != nil {
return false, 0, nil, false, errors.Wrap(err, "unmarshaling lock table value")
}
var outcome lockResolutionOutcome
if str == lock.Intent {
// Intent resolution requires an MVCC iterator to look up the MVCC
// version associated with the intent. Create one.
var iter MVCCIterator
iter, err = rw.NewMVCCIterator(ctx, MVCCKeyIterKind, IterOptions{
Prefix: true,
KeyTypes: IterKeyTypePointsAndRanges,
ReadCategory: fs.IntentResolutionReadCategory,
})
if err != nil {
return false, 0, nil, false, err
}
outcome, err = mvccResolveWriteIntent(ctx, rw, iter, ms, update, &buf.meta, buf)
iter.Close()
} else {
outcome, err = mvccReleaseLockInternal(ctx, rw, ms, update, str, &buf.meta, buf)
replLocksReleased = replLocksReleased || outcome != lockNoop
}
if err != nil {
return false, 0, nil, false, err
}
ok = ok || outcome != lockNoop
}
numBytes = int64(rw.BufferedSize() - beforeBytes)
return ok, numBytes, nil, replLocksReleased, nil
}
// With the separated lock table, we are employing a performance optimization:
// when a lock's metadata value is removed, we preferably want to do so using a
// SingleDel (as opposed to a Del). This is only safe if the previous operations
// on the metadata key allow it. Due to practical limitations, at the time of
// writing the condition we need is that the pebble history of the key consists
// of a single SET. (#69891 tracks an improvement, to also make SingleDel safe
// in the case of `<anything>; (DEL or legitimate SingleDel); SET; SingleDel`,
// which will open up further optimizations).
//
// It is difficult to track the history of engine writes to a key precisely, in
// particular when values are ever aborted. So we apply the optimization only to
// the main case in which it is useful, namely that of a transaction committing
// its lock that it never re-wrote in the initial epoch (i.e. no chance of it
// ever being removed before as part of being pushed). Note that when a txn
// refreshes, it stays in the original epoch, and the intents are moved, which
// does *not* cause a write to the MVCC metadata key (for which the history has
// to remain a single SET). So transactions that "only" refresh are covered by
// the optimization as well.
//
// Note that a transaction can "partially abort" and still commit due to nested
// SAVEPOINTs, such as in the below example:
//
// BEGIN;
// SAVEPOINT foo;
// INSERT INTO kv VALUES(1, 1);
// ROLLBACK TO SAVEPOINT foo;
// INSERT INTO kv VALUES(1, 2);
// COMMIT;
//
// This would first remove the lock (1,1) during the ROLLBACK using a Del (the
// anomaly below would occur just the same if a SingleDel were used here), and thus
// without an additional condition the INSERT (1,2) would be eligible for
// committing via a SingleDel. This has to be avoided as well, since the
// metadata key for k=1 has the following history:
//
// - Set // when (1,1) is written
// - Del // during ROLLBACK
// - Set // when (1,2) is written
// - SingleDel // on COMMIT
//
// However, this sequence could compact as follows (at the time of writing, bound
// to change with #69891):
//
// - Set (Del Set') SingleDel
// ↓
// - Set Set' SingleDel
// - Set (Set' SingleDel)
// ↓
// - Set
//
// which means that a previously deleted lock metadata would erroneously
// become visible again. So on top of restricting SingleDel to the COMMIT case,
// we also restrict it to the case of having no ignored sequence number ranges
// (i.e. no nested txn was rolled back before the commit).
//
// For a deeper discussion of these correctness problems (avoided using the
// scoping down in this helper), see:
//
// https://github.com/cockroachdb/cockroach/issues/69891
type singleDelOptimizationHelper struct {
// Internal state, don't access this, use the getters instead
// (that's what the _ prefix is trying to communicate).
_didNotUpdateMeta *bool
_hasIgnoredSeqs bool
_epoch enginepb.TxnEpoch
}
// v is the inferred value of the TxnDidNotUpdateMeta field.
func (h singleDelOptimizationHelper) v() bool {
if h._didNotUpdateMeta == nil {
return false
}
return *h._didNotUpdateMeta
}
// onCommitLock returns true if the SingleDel optimization is available
// for committing a lock/intent.
func (h singleDelOptimizationHelper) onCommitLock() bool {
// We're committing the lock at epoch zero, the meta tracking says we didn't
// rewrite the lock, and we also didn't previously remove the metadata for
// this key as part of a voluntary rollback of a nested txn. So we are safe to
// use a SingleDel here.
return h.v() && !h._hasIgnoredSeqs && h._epoch == 0
}
// onAbortLock returns true if the SingleDel optimization is available
// for removing a lock/intent. It is always false.
// Note that "removing a lock" can occur if we know that the epoch
// changed, or when a savepoint is rolled back. It does not imply that
// the transaction aborted.
func (h singleDelOptimizationHelper) onAbortLock() bool {
return false
}
type lockResolutionOutcome int8
const (
lockNoop lockResolutionOutcome = iota
lockOverwritten
lockClearedBySingleDelete
lockClearedByDelete
)
// mvccResolveWriteIntent is the core logic for resolving an intent. The
// function accepts instructions for how to resolve the intent (encoded in the
// LockUpdate), and the current value of the intent (meta). Returns how the
// provided intent was resolved (a no-op, rewriting the intent, writing a
// SingleDelete key, or writing a Delete key).
//
// REQUIRES: update and meta refer to the same intent on the same key.
// REQUIRES: iter surfaces range keys via IterKeyTypePointsAndRanges.
func mvccResolveWriteIntent(
ctx context.Context,
writer Writer,
iter MVCCIterator,
ms *enginepb.MVCCStats,
update roachpb.LockUpdate,
meta *enginepb.MVCCMetadata,
buf *putBuffer,
) (outcome lockResolutionOutcome, err error) {
if meta.Txn == nil || meta.Txn.ID != update.Txn.ID {
return lockNoop, errors.Errorf("txn does not match: %v != %v", meta.Txn, update.Txn)
}
metaKey := MakeMVCCMetadataKey(update.Key)
origMetaKeySize := int64(metaKey.EncodedSize())
origMetaValSize := int64(meta.Size())
metaTimestamp := meta.Timestamp.ToTimestamp()
canSingleDelHelper := singleDelOptimizationHelper{
_didNotUpdateMeta: meta.TxnDidNotUpdateMeta,
_hasIgnoredSeqs: len(update.IgnoredSeqNums) > 0,
// NB: the value is only used if epochs match, so it doesn't
// matter if we use the one from meta or incoming request here.
_epoch: update.Txn.Epoch,
}
// An update with a newer epoch than the intent effectively means that we
// wrote this intent before an earlier retry, but didn't write it again
// after. We treat such intents as uncommitted.
//
// An update with a newer timestamp than the intent means that our timestamp
// was pushed during the course of an epoch. We treat such intents as
// committed after moving their timestamp forward. This is possible if a
// transaction writes an intent and then successfully refreshes its
// timestamp to avoid a restart.
//
// An update with an older epoch than the intent should never happen because
// epoch increments require client action. This means that they can't be
// caused by replays.
//
// An update with an older timestamp than the intent should not happen under
// normal circumstances because a client should never bump its timestamp
// after issuing an EndTxn request. Replays of intent writes that are pushed
// forward due to WriteTooOld errors without client action combined with
// replays of intent resolution make this configuration a possibility. We
// treat such intents as uncommitted.
epochsMatch := meta.Txn.Epoch == update.Txn.Epoch
timestampsValid := metaTimestamp.LessEq(update.Txn.WriteTimestamp)
timestampChanged := metaTimestamp.Less(update.Txn.WriteTimestamp)
commit := update.Status == roachpb.COMMITTED && epochsMatch && timestampsValid
// Note the small difference to commit epoch handling here: We allow
// a push from a previous epoch to move a newer intent. That's not
// necessary, but useful for allowing pushers to make forward
// progress. Consider the following, where B reads at a timestamp
// that's higher than any write by A in the following diagram:
//
// | client A@epo | B (pusher) |
// =============================
// | write@1 | |
// | | read |
// | | push |
// | restart | |
// | write@2 | |
// | | resolve@1 |
// =============================
//
// In this case, if we required the epochs to match, we would not push the
// intent forward, and client B would upon retrying after its successful
// push and apparent resolution run into the new version of an intent again
// (which is at a higher timestamp due to the restart, but not out of the
// way of A). It would then actually succeed on the second iteration (since
// the new Epoch propagates to the Push and via that, to the Pushee txn
// used for resolving), but that costs latency.
// TODO(tschottdorf): various epoch-related scenarios here deserve more
// testing.
inProgress := !update.Status.IsFinalized() && meta.Txn.Epoch >= update.Txn.Epoch
pushed := inProgress && timestampChanged
latestKey := MVCCKey{Key: update.Key, Timestamp: metaTimestamp}
// Handle partial txn rollbacks. If the current txn sequence
// is part of a rolled back (ignored) seqnum range, we're going
// to erase that MVCC write and reveal the previous value.
// If _all_ the writes get removed in this way, the intent
// can be considered empty and marked for removal (removeIntent = true).
// If only part of the intent history was rolled back, but the intent still
// remains, the rolledBackVal is set to a non-nil value.
var rolledBackVal *MVCCValue
buf.newMeta = *meta
newMeta := &buf.newMeta
// Update the MVCC history only if:
// 1. There are IgnoredSeqNums present.
// 2. The update is not going to abort the intent; otherwise, the entire
// history will be removed anyway.
// 3. The epochs of the intent and the update match; otherwise the epochs may
// have different seq nums (and ignored seq nums).
if len(update.IgnoredSeqNums) > 0 && (commit || inProgress) && epochsMatch {
// NOTE: mvccMaybeRewriteIntentHistory mutates its meta argument.
// TODO(nvanbenschoten): this is an awkward interface. We shouldn't
// be mutating meta and we shouldn't be restoring the previous value
// here. Instead, this should all be handled down below.
var removeIntent bool
// Instead of modifying meta, pass a copy of it (newMeta), which will be the
// starting point for the updated metadata. It's important to keep meta
// intact and corresponding to the stats in ms to ensure that later on (in
// updateStatsOnResolve) the stats will be updated correctly based on the
// old meta (meta) and the new meta (newMeta).
removeIntent, rolledBackVal, err = mvccMaybeRewriteIntentHistory(ctx, writer, update.IgnoredSeqNums, newMeta, latestKey)
if err != nil {
return lockNoop, err
}
if removeIntent {
// This intent should be cleared. Set commit, pushed, and inProgress to
// false so that this intent isn't updated, gets cleared, and committed
// values are left untouched. Also ensure that rolledBackVal is set to nil
// or we could end up trying to update the intent instead of removing it.
commit = false
pushed = false
inProgress = false
rolledBackVal = nil
}
if rolledBackVal != nil {
// If we need to update the intent to roll back part of its intent
// history, make sure that we don't regress its timestamp, even if the
// caller provided an outdated timestamp.
update.Txn.WriteTimestamp.Forward(metaTimestamp)
}
}
// There's nothing to do if meta's epoch is greater than or equal to the txn's
// epoch, the state is still in progress, the intent was not pushed to a
// larger timestamp, and the rollback code did not modify or mark the intent
// for removal.
if inProgress && !pushed && rolledBackVal == nil {
return lockNoop, nil
}
// If we're committing, or if the commit timestamp of the intent has been moved forward, and if
// the proposed epoch matches the existing epoch: update the meta.Txn. For commit, it's set to
// nil; otherwise, we update its value. We may have to update the actual version value (remove old
// and create new with proper timestamp-encoded key) if timestamp changed.
//
// If the intent has disappeared in mvccMaybeRewriteIntentHistory, we skip
// this block and fall down to the intent/value deletion code path. This
// is because removeIntent implies rolledBackVal == nil, pushed == false, and
// commit == false.
if commit || pushed || rolledBackVal != nil {
// The intent might be committing at a higher timestamp, or it might be
// getting pushed.
newTimestamp := update.Txn.WriteTimestamp
// Assert that the intent timestamp never regresses. The logic above should
// not allow this, regardless of the input to this function.
if newTimestamp.Less(metaTimestamp) {
return lockNoop, errors.AssertionFailedf("timestamp regression (%s -> %s) "+
"during intent resolution, commit=%t pushed=%t rolledBackVal=%t",
metaTimestamp, newTimestamp, commit, pushed, rolledBackVal != nil)
}
// If we're moving the intent's timestamp, rewrite it and adjust stats.
var prevIsValue bool
var prevValSize int64
if timestampChanged {
oldKey := latestKey
newKey := oldKey
newKey.Timestamp = newTimestamp
// Rewrite the versioned value at the new timestamp.
iter.SeekGE(oldKey)
valid, err := iter.Valid()
if err != nil {
return lockNoop, err
} else if valid {
if hasPoint, hasRange := iter.HasPointAndRange(); hasRange && !hasPoint {
// If the seek lands on a bare range key, attempt to step to a point.
iter.Next()
if valid, err = iter.Valid(); err != nil {
return lockNoop, err
} else if valid {
valid, _ = iter.HasPointAndRange()
}
}
}
if !valid || !iter.UnsafeKey().Equal(oldKey) {
return lockNoop, errors.Errorf("existing intent value missing: %s", oldKey)
}
v, err := iter.UnsafeValue()
if err != nil {
return lockNoop, err
}
oldValue, err := DecodeMVCCValue(v)
if err != nil {
return lockNoop, err
}
// Special case: If mvccMaybeRewriteIntentHistory rolled back to a value
// in the intent history and wrote that at oldKey, iter would not be able
// to "see" the value since it was created before that value was written
// to the engine. In this case, reuse the value returned by
// mvccMaybeRewriteIntentHistory.
if rolledBackVal != nil {
oldValue = *rolledBackVal
}
// The local timestamp does not change during intent resolution unless the
// resolver provides a clock observation from this node that was captured
// while the transaction was still pending, in which case it can be advanced
// to the observed timestamp.
newValue := oldValue
newValue.LocalTimestamp = oldValue.GetLocalTimestamp(oldKey.Timestamp)
newValue.LocalTimestamp.Forward(update.ClockWhilePending.Timestamp)
if !newValue.LocalTimestampNeeded(newKey.Timestamp) || !writer.ShouldWriteLocalTimestamps(ctx) {
newValue.LocalTimestamp = hlc.ClockTimestamp{}
}
// Update the MVCC metadata with the timestamp for the upcoming write (or
// at least the stats update).
newMeta.Txn.WriteTimestamp = newTimestamp
newMeta.Timestamp = newTimestamp.ToLegacyTimestamp()
newMeta.KeyBytes = MVCCVersionTimestampSize
newMeta.ValBytes = int64(encodedMVCCValueSize(newValue))
newMeta.Deleted = newValue.IsTombstone()
if err = writer.PutMVCC(newKey, newValue); err != nil {
return lockNoop, err
}
if err = writer.ClearMVCC(oldKey, ClearOptions{
ValueSizeKnown: true,
ValueSize: uint32(len(v)),
}); err != nil {
return lockNoop, err
}
// If there is a value under the intent as it moves timestamps, then
// that value may need an adjustment of its GCBytesAge. This is
// because it became non-live at orig.Timestamp originally, and now
// only becomes non-live at newMeta.Timestamp. For that reason, we
// have to read that version's size.
//
// Look for the first real versioned key, i.e. the key just below
// the (old) meta's timestamp, and for any MVCC range tombstones.
iter.Next()
if valid, err := iter.Valid(); err != nil {
return lockNoop, err
} else if valid {
if hasPoint, hasRange := iter.HasPointAndRange(); hasPoint {
if unsafeKey := iter.UnsafeKey(); unsafeKey.Key.Equal(oldKey.Key) {
if !hasRange || iter.RangeKeys().Versions[0].Timestamp.Less(unsafeKey.Timestamp) {
prevValLen, prevValIsTombstone, err := iter.MVCCValueLenAndIsTombstone()
if err != nil {
return lockNoop, err
}
prevIsValue = !prevValIsTombstone
prevValSize = int64(prevValLen)
}
}
}
}
}
// Update or remove the metadata key.
var metaKeySize, metaValSize int64
var logicalOp MVCCLogicalOpType
if !commit {
// Keep existing intent if we're updating it. We update the existing
// metadata's timestamp instead of using the supplied intent meta to avoid
// overwriting a newer epoch (see comments above). The pusher's job isn't
// to do anything to update the intent but to move the timestamp forward,
// even if it can.
outcome = lockOverwritten
metaKeySize, metaValSize, err = buf.putLockMeta(
writer, metaKey.Key, lock.Intent, newMeta, true /* alreadyExists */)
logicalOp = MVCCUpdateIntentOpType
} else {
outcome = lockClearedByDelete
useSingleDelete := canSingleDelHelper.onCommitLock()
if useSingleDelete {
outcome = lockClearedBySingleDelete
}
metaKeySize, metaValSize, err = buf.clearLockMeta(
writer, metaKey.Key, lock.Intent, useSingleDelete, meta.Txn.ID, ClearOptions{
ValueSizeKnown: true,
ValueSize: uint32(origMetaValSize),
})
logicalOp = MVCCCommitIntentOpType
}
if err != nil {
return lockNoop, err
}
// Update stat counters related to resolving the intent.
if ms != nil {
ms.Add(updateStatsOnResolve(update.Key, prevIsValue, prevValSize, origMetaKeySize, origMetaValSize,
metaKeySize, metaValSize, meta, newMeta, commit))
}
// Log the logical MVCC operation.
writer.LogLogicalOp(logicalOp, MVCCLogicalOpDetails{
Txn: update.Txn,
Key: update.Key,
Timestamp: update.Txn.WriteTimestamp,
})
// outcome is set up above.
return outcome, nil
}
// Otherwise, we're deleting the intent, which includes deleting the
// MVCCMetadata.
//
// Note that we have to support a somewhat unintuitive case - an ABORT with
// update.Txn.Epoch < meta.Txn.Epoch:
// - writer1 writes key0 at epoch 0
// - writer2 with higher priority encounters intent at key0 (epoch 0)
// - writer1 restarts, now at epoch one (txn record not updated)
// - writer1 writes key0 at epoch 1
// - writer2 dispatches ResolveIntent to key0 (with epoch 0)
// - ResolveIntent with epoch 0 aborts intent from epoch 1.
// First clear the provisional value.
if err := writer.ClearMVCC(latestKey, ClearOptions{
ValueSizeKnown: true,
ValueSize: uint32(meta.ValBytes),
}); err != nil {
return lockNoop, err
}
// Log the logical MVCC operation.
writer.LogLogicalOp(MVCCAbortIntentOpType, MVCCLogicalOpDetails{
Txn: update.Txn,
Key: update.Key,
})
ok := false
// These variables containing the next key-value information are initialized
// in the following if-block when ok is set to true. These are only read
// after the if-block when ok is true (i.e., they were initialized).
var unsafeNextKey MVCCKey
var nextValueLen int
var nextValueIsTombstone bool
if nextKey := latestKey.Next(); nextKey.IsValue() {
// The latestKey was not the smallest possible timestamp {WallTime: 0,
// Logical: 1}. Practically, this is the only case that will occur in
// production.
var hasPoint, hasRange bool
iter.SeekGE(nextKey)
if ok, err = iter.Valid(); err != nil {
return lockNoop, err
} else if ok {
// If the seek lands on a bare range key, attempt to step to a point.
if hasPoint, hasRange = iter.HasPointAndRange(); hasRange && !hasPoint {
iter.Next()
if ok, err = iter.Valid(); err != nil {
return lockNoop, err
} else if ok {
hasPoint, hasRange = iter.HasPointAndRange()
ok = hasPoint
}
}
}
if ok = ok && iter.UnsafeKey().Key.Equal(latestKey.Key); ok {
unsafeNextKey = iter.UnsafeKey()
if !unsafeNextKey.IsValue() {
// Should never see an intent for this key since we seeked to a
// particular timestamp.
return lockNoop, errors.Errorf("expected an MVCC value key: %s", unsafeNextKey)
}
nextValueLen, nextValueIsTombstone, err = iter.MVCCValueLenAndIsTombstone()
if err != nil {
return lockNoop, err
}
// If a non-tombstone point key is covered by a range tombstone, then
// synthesize a point tombstone at the lowest range tombstone covering it.
// This is where the point key ceases to exist, contributing to GCBytesAge.
if !nextValueIsTombstone && hasRange {
if v, found := iter.RangeKeys().FirstAtOrAbove(unsafeNextKey.Timestamp); found {
unsafeNextKey.Timestamp = v.Timestamp
nextValueIsTombstone = true
nextValueLen = 0
}
}
}
iter = nil // prevent accidental use below
}
// Else stepped to next key, so !ok
if !ok {
// If there is no other version, we should just clean up the key entirely.
outcome = lockClearedByDelete
useSingleDelete := canSingleDelHelper.onAbortLock()
if useSingleDelete {
outcome = lockClearedBySingleDelete
}
_, _, err := buf.clearLockMeta(
writer, metaKey.Key, lock.Intent, useSingleDelete, meta.Txn.ID, ClearOptions{
ValueSizeKnown: true,
ValueSize: uint32(origMetaValSize),
})
if err != nil {
return lockNoop, err
}
// Clear stat counters attributable to the intent we're aborting.
if ms != nil {
ms.Add(updateStatsOnClear(
update.Key, origMetaKeySize, origMetaValSize, 0, 0, meta, nil, 0))
}
// outcome is set above before the clearLockMeta call.
return outcome, nil
}
// Update the keyMetadata with the next version.
buf.newMeta = enginepb.MVCCMetadata{
Deleted: nextValueIsTombstone,
KeyBytes: MVCCVersionTimestampSize,
ValBytes: int64(nextValueLen),
}
outcome = lockClearedByDelete
useSingleDelete := canSingleDelHelper.onAbortLock()
if useSingleDelete {
outcome = lockClearedBySingleDelete
}
metaKeySize, metaValSize, err := buf.clearLockMeta(
writer, metaKey.Key, lock.Intent, useSingleDelete, meta.Txn.ID, ClearOptions{
ValueSizeKnown: true,
ValueSize: uint32(origMetaValSize),
})
if err != nil {
return lockNoop, err
}
// Update stat counters with older version.
if ms != nil {
ms.Add(updateStatsOnClear(update.Key, origMetaKeySize, origMetaValSize, metaKeySize,
metaValSize, meta, &buf.newMeta, unsafeNextKey.Timestamp.WallTime))
}
// outcome is set above before the clearLockMeta call.
return outcome, nil
}
// mvccMaybeRewriteIntentHistory rewrites the intent to reveal the latest
// stored value, ignoring all values from the history that have an
// ignored seqnum.
// The remove return value, when true, indicates that
// all the writes in the intent are ignored and the intent should
// be marked for removal as it does not exist any more.
// The updatedVal, when non-nil, indicates that the intent was updated
// and should be overwritten in engine.
func mvccMaybeRewriteIntentHistory(
ctx context.Context,
writer Writer,
ignoredSeqNums []enginepb.IgnoredSeqNumRange,
meta *enginepb.MVCCMetadata,
latestKey MVCCKey,
) (remove bool, updatedVal *MVCCValue, err error) {
if !enginepb.TxnSeqIsIgnored(meta.Txn.Sequence, ignoredSeqNums) {
// The latest write was not ignored. Nothing to do here. We'll
// proceed with the intent as usual.
return false, nil, nil
}
// Find the latest historical write before the current one that was
// not ignored.
var i int
for i = len(meta.IntentHistory) - 1; i >= 0; i-- {
e := &meta.IntentHistory[i]
if !enginepb.TxnSeqIsIgnored(e.Sequence, ignoredSeqNums) {
break
}
}
// If i < 0, we don't have an intent any more: everything
// has been rolled back.
if i < 0 {
return true, nil, nil
}
// Otherwise, we place the write at that history entry
// back into the intent.
restoredValRaw := meta.IntentHistory[i].Value
restoredVal, err := DecodeMVCCValue(restoredValRaw)
if err != nil {
return false, nil, err
}
meta.Txn.Sequence = meta.IntentHistory[i].Sequence
meta.IntentHistory = meta.IntentHistory[:i]
meta.Deleted = restoredVal.IsTombstone()
meta.ValBytes = int64(len(restoredValRaw))
// And also overwrite whatever was there in storage.
err = writer.PutMVCC(latestKey, restoredVal)
return false, &restoredVal, err
}
// MVCCResolveWriteIntentRange commits or aborts (rolls back) the range of write
// intents specified by start and end keys for a given txn.
// ResolveWriteIntentRange will skip write intents of other txns.
//
// An opts.MaxKeys of zero means unbounded. An opts.MaxKeys of < 0 means
// resolve nothing and return the entire intent span as the resume span. An
// opts.TargetBytes of 0 means no byte limit. An opts.TargetBytes of < 0 means
// resolve nothing and return the entire intent span as the resume span. If
// opts.TargetBytes > 0, then resolve intents in the range until the number of
// bytes added to the write batch by intent resolution exceeds
// opts.TargetBytes.
//
// Returns the number of intents resolved, number of bytes added to the write
// batch by intent resolution, the resume span if the max keys or bytes limit
// was exceeded, and the resume reason. Additionally, if any replicated locks
// with strength lock.Shared or lock.Exclusive are released, a boolean
// indicating as such is also returned.
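//
// A minimal caller sketch (ctx, batch, ms, txn, and span are hypothetical;
// the MaxKeys value is purely illustrative):
//
//	update := roachpb.LockUpdate{
//		Span:   span, // resolves intents in [span.Key, span.EndKey)
//		Txn:    txn.TxnMeta,
//		Status: roachpb.ABORTED,
//	}
//	numKeys, numBytes, resumeSpan, resumeReason, _, err := MVCCResolveWriteIntentRange(
//		ctx, batch, ms, update, MVCCResolveWriteIntentRangeOptions{MaxKeys: 1000})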
func MVCCResolveWriteIntentRange(
ctx context.Context,
rw ReadWriter,
ms *enginepb.MVCCStats,
update roachpb.LockUpdate,
opts MVCCResolveWriteIntentRangeOptions,
) (
numKeys, numBytes int64,
resumeSpan *roachpb.Span,
resumeReason kvpb.ResumeReason,
replLocksReleased bool,
err error,
) {
keysExceeded := opts.MaxKeys < 0
bytesExceeded := opts.TargetBytes < 0
if keysExceeded || bytesExceeded {
resumeSpan := update.Span // don't inline or `update` would escape to heap
if keysExceeded {
resumeReason = kvpb.RESUME_KEY_LIMIT
} else if bytesExceeded {
resumeReason = kvpb.RESUME_BYTE_LIMIT
}
return 0, 0, &resumeSpan, resumeReason, false, nil
}
ltStart, _ := keys.LockTableSingleKey(update.Key, nil)
ltEnd, _ := keys.LockTableSingleKey(update.EndKey, nil)
ltIter, err := NewLockTableIterator(ctx, rw, LockTableIteratorOptions{
LowerBound: ltStart,
UpperBound: ltEnd,
MatchTxnID: update.Txn.ID,
ReadCategory: fs.IntentResolutionReadCategory,
})
if err != nil {
return 0, 0, nil, 0, false, err
}
defer ltIter.Close()
var mvccIter MVCCIterator
iterOpts := IterOptions{
KeyTypes: IterKeyTypePointsAndRanges,
LowerBound: update.Key,
UpperBound: update.EndKey,
ReadCategory: fs.IntentResolutionReadCategory,
}
if rw.ConsistentIterators() {
// Production code should always have consistent iterators.
mvccIter, err = rw.NewMVCCIterator(ctx, MVCCKeyIterKind, iterOpts)
if err != nil {
return 0, 0, nil, 0, false, err
}
} else {
// For correctness, we need mvccIter to be consistent with engineIter.
mvccIter = newPebbleIteratorByCloning(ctx, ltIter.CloneContext(), iterOpts, StandardDurability)
}
defer mvccIter.Close()
buf := newPutBuffer()
defer buf.release()
intentEndKey := update.EndKey
update.EndKey = nil
var lastResolvedKey roachpb.Key
var lastResolvedKeyOk bool
for valid, err := ltIter.SeekEngineKeyGE(EngineKey{Key: ltStart}); ; valid, err = ltIter.NextEngineKey() {
if err != nil {
return 0, 0, nil, 0, false, errors.Wrap(err, "seeking lock table")
} else if !valid {
// No more intents in the given range.
break
}
ltEngineKey, err := ltIter.EngineKey()
if err != nil {
return 0, 0, nil, 0, false, errors.Wrap(err, "retrieving lock table key")
}
ltKey, err := ltEngineKey.ToLockTableKey()
if err != nil {
return 0, 0, nil, 0, false, errors.Wrap(err, "decoding lock table key")
}
sameLockedKey := lastResolvedKey.Equal(ltKey.Key)
if !sameLockedKey {
// If this is not the same locked key as the last iteration, check
// whether we've exceeded the max keys or bytes limit. We don't check in
// between locks with different strengths on the same key because we
// can't encode a resume span that would be correct in that case. A
// transaction can only hold up to 3 locks on any given key, so this
// will never lead to us significantly overshooting the TargetBytes
// limit. We also only count each unique locked key once towards the
// MaxKeys limit, so this will never lead to us overshooting the MaxKeys
// limit at all.
keysExceeded = opts.MaxKeys > 0 && numKeys == opts.MaxKeys
bytesExceeded = opts.TargetBytes > 0 && numBytes >= opts.TargetBytes
if keysExceeded || bytesExceeded {
if keysExceeded {
resumeReason = kvpb.RESUME_KEY_LIMIT
} else if bytesExceeded {
resumeReason = kvpb.RESUME_BYTE_LIMIT
}
// We could also compute a tighter nextKey here if we wanted to.
resumeSpan := &roachpb.Span{Key: lastResolvedKey.Next(), EndKey: intentEndKey}
return numKeys, numBytes, resumeSpan, resumeReason, replLocksReleased, nil
}
// Copy the underlying bytes of the unsafe key. This is needed for
// stability of the key to check for sameLockedKey and to construct
// a resume span on subsequent iteration.
lastResolvedKey = append(lastResolvedKey[:0], ltKey.Key...)
lastResolvedKeyOk = false
}
if ltKey.TxnUUID != update.Txn.ID {
return 0, 0, nil, 0, false, errors.AssertionFailedf(
"unexpected txnID %v != %v while scanning lock table", ltKey.TxnUUID, update.Txn.ID)
}
update.Key = ltKey.Key
if err := ltIter.ValueProto(&buf.meta); err != nil {
return 0, 0, nil, 0, false, errors.Wrap(err, "unmarshaling lock table value")
}
beforeBytes := rw.BufferedSize()
var outcome lockResolutionOutcome
if ltKey.Strength == lock.Intent {
outcome, err = mvccResolveWriteIntent(ctx, rw, mvccIter, ms, update, &buf.meta, buf)
} else {
outcome, err = mvccReleaseLockInternal(ctx, rw, ms, update, ltKey.Strength, &buf.meta, buf)
replLocksReleased = replLocksReleased || outcome != lockNoop
}
if err != nil {
log.Warningf(ctx, "failed to resolve intent for key %q: %+v", lastResolvedKey, err)
}
if outcome != lockNoop && !lastResolvedKeyOk {
// We only count the first successfully resolved lock/intent on a
// given key towards the returned key count and key limit.
lastResolvedKeyOk = true
numKeys++
}
numBytes += int64(rw.BufferedSize() - beforeBytes)
}
return numKeys, numBytes, nil, 0, replLocksReleased, nil
}
// MVCCCheckForAcquireLock scans the replicated lock table to determine whether
// a lock acquisition at the specified key and strength by the specified
// transaction[1] would succeed. If the lock table scan finds one or more
// existing locks on the key that conflict with the acquisition then a
// LockConflictError is returned. Otherwise, nil is returned. Unlike
// MVCCAcquireLock, this method does not actually acquire the lock (i.e. write
// to the lock table).
//
// [1] Non-transactional requests cannot acquire locks that outlive themselves,
// but they are still able to specify a locking strength and conflict with other
// transactions. Therefore, it is valid to supply a nil transaction to this
// function.
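//
// A minimal sketch of a caller (reader and txn are hypothetical; the limits
// shown are illustrative):
//
//	err := MVCCCheckForAcquireLock(ctx, reader, txn, lock.Shared,
//		roachpb.Key("a"), 64 /* maxLockConflicts */, 0 /* targetLockConflictBytes */)
//	// err == nil: the acquisition would succeed.
//	// A LockConflictError reports the conflicting lock(s) instead.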
func MVCCCheckForAcquireLock(
ctx context.Context,
reader Reader,
txn *roachpb.Transaction,
str lock.Strength,
key roachpb.Key,
maxLockConflicts int64,
targetLockConflictBytes int64,
) error {
if err := validateLockAcquisitionStrength(str); err != nil {
return err
}
var txnID uuid.UUID
if txn != nil {
txnID = txn.ID
}
ltScanner, err := newLockTableKeyScanner(
ctx, reader, txnID, str, maxLockConflicts, targetLockConflictBytes, fs.BatchEvalReadCategory)
if err != nil {
return err
}
defer ltScanner.close()
return ltScanner.scan(key)
}
// MVCCAcquireLock attempts to acquire a lock at the specified key and strength
// by the specified transaction. It first scans the replicated lock table to
// determine whether any conflicting locks are held by other transactions. If
// so, a LockConflictError is returned. Otherwise, the lock is written to the
// lock table and nil is returned.
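//
// A hedged usage sketch (batch, txnMeta, and ms are hypothetical; a real
// caller passes the transaction's ignored seqnum ranges instead of nil):
//
//	err := MVCCAcquireLock(ctx, batch, txnMeta, nil /* ignoredSeqNums */,
//		lock.Exclusive, roachpb.Key("a"), ms,
//		64 /* maxLockConflicts */, 0 /* targetLockConflictBytes */)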
func MVCCAcquireLock(
ctx context.Context,
rw ReadWriter,
txn *enginepb.TxnMeta,
ignoredSeqNums []enginepb.IgnoredSeqNumRange,
str lock.Strength,
key roachpb.Key,
ms *enginepb.MVCCStats,
maxLockConflicts int64,
targetLockConflictBytes int64,
) error {
if txn == nil {
// Non-transactional requests cannot acquire locks that outlive their
// lifespan; they can only check for conflicting locks using
// MVCCCheckForAcquireLock.
return errors.Errorf("txn must be non-nil to acquire a replicated lock")
}
if err := validateLockAcquisitionStrength(str); err != nil {
return err
}
ltScanner, err := newLockTableKeyScanner(
ctx, rw, txn.ID, str, maxLockConflicts, targetLockConflictBytes, fs.BatchEvalReadCategory)
if err != nil {
return err
}
defer ltScanner.close()
err = ltScanner.scan(key)
if err != nil {
return err
}
// Iterate over the replicated lock strengths, from strongest to weakest,
// stopping at the lock strength that we'd like to acquire. If the loop
// terminates without returning early, rolledBack describes the lock at the
// desired strength: true means a lock metadata key already exists for it
// (from an older epoch or a rolled-back sequence number) and will be
// overwritten below.
var rolledBack bool
for _, iterStr := range strongerOrEqualStrengths(str) {
rolledBack = false
foundLock := ltScanner.foundOwn(iterStr)
if foundLock == nil {
// Proceed to check weaker strengths...
continue
}
if foundLock.Txn.Epoch > txn.Epoch {
// Acquiring at old epoch.
return errors.Errorf(
"locking request with epoch %d came after lock "+
"had already been acquired at epoch %d in txn %s",
txn.Epoch, foundLock.Txn.Epoch, txn.ID)
} else if foundLock.Txn.Epoch < txn.Epoch {
// Acquiring at new epoch.
rolledBack = true
} else if foundLock.Txn.Sequence > txn.Sequence {
// Acquiring at same epoch and an old sequence number.
//
// If the found lock has a different strength than the acquisition then we
// ignore it and continue. We are likely part of a replayed batch where a
// later request in the batch acquired a lock with a higher strength (or
// performed an intent write) on the same key.
if iterStr != str {
continue
}
// If the found lock has the same strength as the acquisition then this is
// an unexpected case. We are likely part of a replayed batch and either:
// 1. the lock was reacquired at a later sequence number and the minimum
// acquisition sequence number was not properly retained (bug!). See
// below about why we preserve the earliest non-rolled back sequence
// number for each lock strength.
// 2. this acquisition's sequence number was rolled back and the lock was
// subsequently acquired again at a higher sequence number. In such
// cases, we can return an error as the client is no longer waiting for
// a response.
return errors.Errorf(
"cannot acquire lock with strength %s at seq number %d, "+
"already held at higher seq number %d",
str.String(), txn.Sequence, foundLock.Txn.Sequence)
} else if enginepb.TxnSeqIsIgnored(foundLock.Txn.Sequence, ignoredSeqNums) {
// Acquiring at same epoch and new sequence number after
// previous sequence number was rolled back.
//
// TODO(nvanbenschoten): If this is a stronger strength than
// we're trying to acquire, then it would be an option to
// release this lock/intent at the same time as we acquire the
// new, weaker lock at higher, non-rolled back sequence number.
// This is what we do for unreplicated locks in the lock table.
//
// We don't currently do this for replicated locks because lock
// acquisition may be holding weaker latches than are needed to
// release locks at the stronger strength. This could lead to a race
// where concurrent work that conflicts with the existing lock but
// not the latches held by this acquisition discovers the lock and
// reports it to the lock table. The in-memory lock table could then
// get out of sync with the replicated lock table.
if iterStr != lock.Intent {
rolledBack = true
} else {
// If the existing lock is an intent, additionally check the
// intent history to verify that all of the intent writes in
// the intent history are also rolled back. If not, then we
// can still avoid reacquisition.
inHistoryNotRolledBack := false
for _, e := range foundLock.IntentHistory {
if !enginepb.TxnSeqIsIgnored(e.Sequence, ignoredSeqNums) {
inHistoryNotRolledBack = true
break
}
}
rolledBack = !inHistoryNotRolledBack
}
}
if !rolledBack {
// Lock held at desired or stronger strength. No need to reacquire.
// This is both a performance optimization and a necessary check for
// correctness. If we were to reacquire the lock at a newer sequence
// number and clobber the existing lock with its older sequence
// number, our newer sequence number could then be rolled back and
// we would forget that the lock held at the older sequence number
// had been and still should be held.
if log.ExpensiveLogEnabled(ctx, 3) {
log.VEventf(ctx, 3, "skipping lock acquisition for txn %s on key %s "+
"with strength %s; found existing lock with strength %s and sequence %d",
txn, key, str, iterStr, foundLock.Txn.Sequence)
}
return nil
}
// Proceed to check weaker strengths...
}
// Write the lock.
buf := newPutBuffer()
defer buf.release()
newMeta := &buf.newMeta
newMeta.Txn = txn
newMeta.Timestamp = txn.WriteTimestamp.ToLegacyTimestamp()
keyBytes, valBytes, err := buf.putLockMeta(rw, key, str, newMeta, rolledBack)
if err != nil {
return err
}
// Update MVCC stats.
if ms != nil {
origMeta := ltScanner.foundOwn(str)
var origKeySize, origValSize int64
if origMeta != nil {
origKeySize = keyBytes // same key
origValSize = int64(origMeta.Size())
}
ms.Add(updateStatsOnAcquireLock(origKeySize, origValSize, keyBytes, valBytes, origMeta, newMeta))
}
return nil
}
func validateLockAcquisitionStrength(str lock.Strength) error {
if !(str == lock.Shared || str == lock.Exclusive) {
return errors.Errorf("invalid lock strength to acquire lock: %s", str.String())
}
return nil
}
// MVCCVerifyLock returns true if the supplied transaction holds a lock that
// offers equal to or greater protection[1] than the supplied lock strength.
//
// [1] Locks that were acquired at sequence numbers that have since been ignored
// aren't considered, as they may be rolled back in the future.
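//
// A minimal sketch (reader and txnMeta are hypothetical):
//
//	held, err := MVCCVerifyLock(ctx, reader, txnMeta, lock.Shared,
//		roachpb.Key("a"), nil /* ignoredSeqNums */)
//	// held is true if the txn holds a Shared, Exclusive, or Intent lock on
//	// "a" at a non-rolled-back sequence number of the current epoch.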
func MVCCVerifyLock(
ctx context.Context,
reader Reader,
txn *enginepb.TxnMeta,
str lock.Strength,
key roachpb.Key,
ignoredSeqNums []enginepb.IgnoredSeqNumRange,
) (bool, error) {
if txn == nil {
// Non-transactional requests cannot acquire locks that outlive their
// lifespan. Nothing to verify.
return false, errors.Errorf("txn must be non-nil to verify replicated lock")
}
if str == lock.None {
return false, errors.Errorf("querying a lock with strength %s is nonsensical", lock.None)
}
// NB: Pass in lock.None when configuring the lockTableKeyScanner to only
// return locks held by our transaction.
ltScanner, err := newLockTableKeyScanner(
ctx, reader, txn.ID, lock.None, 0, 0, fs.BatchEvalReadCategory,
)
if err != nil {
return false, err
}
defer ltScanner.close()
err = ltScanner.scan(key)
if err != nil {
return false, err
}
for _, iterStr := range strongerOrEqualStrengths(str) {
foundLock := ltScanner.foundOwn(iterStr)
if foundLock == nil {
// Proceed to check weaker strengths...
continue
}
if foundLock.Txn.Epoch != txn.Epoch {
continue // the lock belongs to a different epoch
}
// We don't keep a full history of all sequence numbers a replicated lock
// was acquired at. As long as there exists a lock at some (non-rolled back)
// sequence number with sufficient lock strength, we have the desired mutual
// exclusion guarantees. We need to make sure the lock we found was written
// at a sequence number that hasn't been rolled back; otherwise, there's
// nothing stopping another request from rolling back the lock even though
// it exists right now.
if enginepb.TxnSeqIsIgnored(foundLock.Txn.Sequence, ignoredSeqNums) {
if iterStr != lock.Intent {
// The lock is ignored. Proceed to check weaker lock strengths...
continue
}
// If the existing lock is an intent, additionally check the intent
// history to verify that all of the intent writes in the intent history
// are also rolled back. If not, an element in the intent history is
// providing the required protection.
//
// This is not just an optimization. It is necessary for the correctness
// of MVCCVerifyLock because MVCCAcquireLock will skip lock acquisition if
// it finds a non-rolled back intent in the intent history.
inHistoryNotRolledBack := false
for _, e := range foundLock.IntentHistory {
if !enginepb.TxnSeqIsIgnored(e.Sequence, ignoredSeqNums) {
inHistoryNotRolledBack = true
break
}
}
if !inHistoryNotRolledBack {
// The intent and all prior intents in the intent history are
// ignored, proceed to check weaker lock strengths...
continue
}
}
return true, nil
}
return false, nil
}
// mvccReleaseLockInternal releases a lock at the specified key and strength and
// by the specified transaction. The function accepts the instructions for how
// to release the lock (encoded in the LockUpdate), and the current value of the
// lock (meta).
func mvccReleaseLockInternal(
ctx context.Context,
writer Writer,
ms *enginepb.MVCCStats,
update roachpb.LockUpdate,
str lock.Strength,
meta *enginepb.MVCCMetadata,
buf *putBuffer,
) (lockResolutionOutcome, error) {
finalized := update.Status.IsFinalized()
rolledBack := meta.Txn.Epoch < update.Txn.Epoch ||
(meta.Txn.Epoch == update.Txn.Epoch && enginepb.TxnSeqIsIgnored(meta.Txn.Sequence, update.IgnoredSeqNums))
release := finalized || rolledBack
if !release {
return lockNoop, nil
}
canSingleDelHelper := singleDelOptimizationHelper{
_didNotUpdateMeta: meta.TxnDidNotUpdateMeta,
_hasIgnoredSeqs: len(update.IgnoredSeqNums) > 0,
_epoch: update.Txn.Epoch,
}
var txnDidNotUpdateMeta bool
if update.Status == roachpb.COMMITTED && !rolledBack {
txnDidNotUpdateMeta = canSingleDelHelper.onCommitLock()
} else {
txnDidNotUpdateMeta = canSingleDelHelper.onAbortLock()
}
keyBytes, _, err := buf.clearLockMeta(writer, update.Key, str, txnDidNotUpdateMeta, meta.Txn.ID, ClearOptions{
ValueSizeKnown: true,
ValueSize: uint32(meta.Size()),
})
if err != nil {
return lockNoop, err
}
// Update MVCC stats.
if ms != nil {
origKeySize := keyBytes // same key
origValSize := int64(meta.Size())
ms.Add(updateStatsOnReleaseLock(origKeySize, origValSize, meta))
}
if txnDidNotUpdateMeta {
return lockClearedBySingleDelete, nil
}
return lockClearedByDelete, nil
}
// MVCCGarbageCollect creates an iterator on the ReadWriter. In parallel
// it iterates through the keys listed for garbage collection by the
// keys slice. The iterator is seeked in turn to each listed
// key, clearing all values with timestamps <= to expiration. The
// timestamp parameter is used to compute the intent age on GC.
//
// Note that this method will be sorting the keys.
//
// REQUIRES: the keys are either all local keys, or all global keys, and
// not a mix of the two. This is to accommodate the implementation below
// that creates an iterator with bounds that span from the first to last
// key (in sorted order).
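//
// A hedged usage sketch (batch, ms, and now are hypothetical; the timestamps
// are illustrative per-key GC thresholds):
//
//	gcKeys := []kvpb.GCRequest_GCKey{
//		{Key: roachpb.Key("a"), Timestamp: hlc.Timestamp{WallTime: 100}},
//		{Key: roachpb.Key("b"), Timestamp: hlc.Timestamp{WallTime: 200}},
//	}
//	err := MVCCGarbageCollect(ctx, batch, ms, gcKeys, now)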
func MVCCGarbageCollect(
ctx context.Context,
rw ReadWriter,
ms *enginepb.MVCCStats,
keys []kvpb.GCRequest_GCKey,
timestamp hlc.Timestamp,
) (retE error) {
var count int64
if log.ExpensiveLogEnabled(ctx, 1) {
defer func(begin time.Time) {
lk, c := len(keys), count // alloc only when needed
log.Eventf(ctx, "handled %d incoming point keys; deleted %d in %s",
lk, c, timeutil.Since(begin))
if retE != nil {
log.Eventf(ctx, "err: %s", retE)
}
}(timeutil.Now())
}
// If there are no keys then there is no work.
if len(keys) == 0 {
return nil
}
// Sort the slice to both determine the bounds and ensure that we're seeking
// in increasing order.
sort.Slice(keys, func(i, j int) bool {
iKey := MVCCKey{Key: keys[i].Key, Timestamp: keys[i].Timestamp}
jKey := MVCCKey{Key: keys[j].Key, Timestamp: keys[j].Timestamp}
return iKey.Less(jKey)
})
// Bound the iterator appropriately for the set of keys we'll be garbage
// collecting.
iter, err := rw.NewMVCCIterator(ctx, MVCCKeyAndIntentsIterKind, IterOptions{
LowerBound: keys[0].Key,
UpperBound: keys[len(keys)-1].Key.Next(),
KeyTypes: IterKeyTypePointsAndRanges,
ReadCategory: fs.MVCCGCReadCategory,
})
if err != nil {
return err
}
defer iter.Close()
// Cached stack of range tombstones covering current point. Used to determine
// GCBytesAge of deleted value by searching first covering range tombstone
// above.
var rangeTombstones MVCCRangeKeyStack
// Iterate through specified GC keys.
meta := &enginepb.MVCCMetadata{}
for _, gcKey := range keys {
encKey := MakeMVCCMetadataKey(gcKey.Key)
// TODO(oleg): Results of this call are not obvious and logic to handle
// stats updates for different real and synthesized metadata becomes
// unnecessarily complicated. Revisit this to make it cleaner.
ok, metaKeySize, metaValSize, realKeyChanged, err := mvccGetMetadata(iter, encKey, meta)
if err != nil {
return err
}
if !ok {
continue
}
// If mvccGetMetadata landed on a bare range tombstone for the key it will
// synthesize deletion meta. We need to filter this case out to avoid
// updating key stats as the key doesn't exist.
// An empty realKeyChanged indicates that no versioned values are present for
// the key; however, the key may still carry inline metadata, which we detect
// by checking whether the meta is inlined.
// If neither is present, we can skip to the next gc key as there's nothing to GC.
// As a side effect of this change, we should be positioned on the point
// key or inlined meta at this point and can do further checks for GC
// eligibility.
inlinedValue := meta.IsInline()
if realKeyChanged.IsEmpty() && !inlinedValue {
continue
}
// We are guaranteed now to be positioned at the meta or version key that
// belongs to gcKey history.
unsafeKey := iter.UnsafeKey()
implicitMeta := unsafeKey.IsValue()
// Note that we naively can't terminate GC'ing keys loop early if we
// enter any of branches below, as it will update the stats under the
// provision that the (implicit or explicit) meta key (and thus all
// versions) are being removed. We had this faulty functionality at some
// point; it should no longer be necessary since the higher levels already
// make sure each individual GCRequest does bounded work.
//
// First check for the case of range tombstone covering keys when no
// metadata is available.
if implicitMeta && meta.Deleted && !meta.Timestamp.Equal(unsafeKey.Timestamp) {
// If we have implicit deletion meta, and realKeyChanged is not the first
// key in history, that means it is covered by a range tombstone (which
// was used to synthesize meta).
if unsafeKey.Timestamp.LessEq(gcKey.Timestamp) {
// If first object in history is at or below gcKey timestamp then we
// have no explicit meta and all objects are subject to deletion.
if ms != nil {
ms.Add(updateStatsOnGC(gcKey.Key, metaKeySize, metaValSize, true, /* metaKey */
realKeyChanged.WallTime))
}
}
} else if meta.Timestamp.ToTimestamp().LessEq(gcKey.Timestamp) {
// Then, check whether all values of the key are being deleted in the
// rest of the cases.
//
// For version keys, don't allow GC'ing the meta key if it's
// not marked deleted. However, for inline values we allow it;
// they are internal and GCing them directly saves the extra
// deletion step.
if !meta.Deleted && !inlinedValue {
return errors.Errorf("request to GC non-deleted, latest value of %q", gcKey.Key)
}
if meta.Txn != nil {
return errors.Errorf("request to GC intent at %q", gcKey.Key)
}
if ms != nil {
if inlinedValue {
updateStatsForInline(ms, gcKey.Key, metaKeySize, metaValSize, 0, 0)
ms.AgeTo(timestamp.WallTime)
} else {
ms.Add(updateStatsOnGC(gcKey.Key, metaKeySize, metaValSize, true /* metaKey */, meta.Timestamp.WallTime))
}
}
if !implicitMeta {
// This must be an inline entry since we are not allowed to clear
// intents, and we've confirmed that meta.Txn == nil earlier.
if err := rw.ClearUnversioned(iter.UnsafeKey().Key, ClearOptions{
ValueSizeKnown: true,
ValueSize: uint32(iter.ValueLen()),
}); err != nil {
return err
}
count++
}
}
if !implicitMeta {
// The iter is pointing at an MVCCMetadata, advance to the next entry.
iter.Next()
}
// For GCBytesAge, this requires keeping track of the previous key's
// timestamp (prevNanos). See ComputeStats for a more easily digested and
// better commented version of this logic. The below block will set
// prevNanos to the appropriate value and position the iterator at the first
// garbage version.
prevNanos := timestamp.WallTime
{
// If true - forward iteration positioned iterator on first garbage
// (key.ts <= gc.ts).
var foundPrevNanos bool
{
// We'll step the iterator a few times before attempting to seek.
// True if we found next key while iterating. That means there's no
// garbage for the key.
var foundNextKey bool
// If there are a large number of versions which are not garbage,
// iterating through all of them is very inefficient. However, if there
// are few, SeekLT is inefficient. MVCCGarbageCollect will try to step
// the iterator a few times to find the predecessor of gcKey before
// resorting to seeking.
//
// In a synthetic benchmark where there is one version of garbage and
// one not, this optimization showed a 50% improvement. More
// importantly, this optimization mitigated the overhead of the Seek
// approach when almost all of the versions are garbage.
const nextsBeforeSeekLT = 4
for i := 0; i < nextsBeforeSeekLT; i++ {
if i > 0 {
iter.Next()
}
if ok, err := iter.Valid(); err != nil {
return err
} else if !ok {
foundNextKey = true
break
}
if hasPoint, _ := iter.HasPointAndRange(); !hasPoint {
foundNextKey = true
break
}
unsafeIterKey := iter.UnsafeKey()
if !unsafeIterKey.Key.Equal(encKey.Key) {
foundNextKey = true
break
}
if unsafeIterKey.Timestamp.LessEq(gcKey.Timestamp) {
foundPrevNanos = true
break
}
prevNanos = unsafeIterKey.Timestamp.WallTime
}
// We have nothing to GC for this key if we found the next key.
if foundNextKey {
continue
}
}
// Stepping with the iterator did not get us to our target garbage key or
// its predecessor. Seek to the predecessor to find the right value for
// prevNanos and position the iterator on the gcKey.
if !foundPrevNanos {
gcKeyMVCC := MVCCKey{Key: gcKey.Key, Timestamp: gcKey.Timestamp}
iter.SeekLT(gcKeyMVCC)
if ok, err := iter.Valid(); err != nil {
return err
} else if ok {
if hasPoint, _ := iter.HasPointAndRange(); hasPoint {
// Use the previous version's timestamp if it's for this key.
if iter.UnsafeKey().Key.Equal(gcKey.Key) {
prevNanos = iter.UnsafeKey().Timestamp.WallTime
}
// Seek to the first version for deletion.
iter.Next()
}
}
}
}
// At this point iterator is positioned on first garbage version and forward
// iteration will give us all versions to delete up to the next key.
if ms != nil {
// We need to iterate ranges only to compute GCBytesAge if we are updating
// stats.
//
// We can't rely on range key changed iterator functionality here as we do
// seek on every loop iteration to find next key to GC.
if _, hasRange := iter.HasPointAndRange(); hasRange {
if !rangeTombstones.Bounds.Key.Equal(iter.RangeBounds().Key) {
iter.RangeKeys().CloneInto(&rangeTombstones)
}
} else if !rangeTombstones.IsEmpty() {
rangeTombstones.Clear()
}
}
// Iterate through the garbage versions, accumulating their stats and
// issuing clear operations.
for ; ; iter.Next() {
if ok, err := iter.Valid(); err != nil {
return err
} else if !ok {
break
}
unsafeIterKey := iter.UnsafeKey()
if !unsafeIterKey.Key.Equal(encKey.Key) {
break
}
if !unsafeIterKey.IsValue() {
break
}
var clearOpts ClearOptions
if ms != nil {
valLen, valIsTombstone, err := iter.MVCCValueLenAndIsTombstone()
if err != nil {
return err
}
clearOpts.ValueSizeKnown = true
clearOpts.ValueSize = uint32(valLen)
keySize := MVCCVersionTimestampSize
valSize := int64(valLen)
// A non-deletion becomes non-live when its newer neighbor shows up.
// A deletion tombstone becomes non-live right when it is created.
fromNS := prevNanos
if valIsTombstone {
fromNS = unsafeIterKey.Timestamp.WallTime
} else if !rangeTombstones.IsEmpty() {
// For non deletions, we need to find if we had a range tombstone
// between this and next value (prevNanos) to use its timestamp for
// computing GCBytesAge.
if kv, ok := rangeTombstones.FirstAtOrAbove(unsafeIterKey.Timestamp); ok {
if kv.Timestamp.WallTime < fromNS {
fromNS = kv.Timestamp.WallTime
}
}
}
ms.Add(updateStatsOnGC(gcKey.Key, keySize, valSize, false /* metaKey */, fromNS))
}
count++
if err := rw.ClearMVCC(unsafeIterKey, clearOpts); err != nil {
return err
}
prevNanos = unsafeIterKey.Timestamp.WallTime
}
}
return nil
}
// CollectableGCRangeKey is a struct containing a range key as well as the
// span boundaries latched for that range key.
// Range GC needs a latch span because it must expand iteration beyond the
// range key itself to find adjacent range keys, and those ranges should be
// safe to read.
type CollectableGCRangeKey struct {
MVCCRangeKey
LatchSpan roachpb.Span
}
// MVCCGarbageCollectRangeKeys is similar in functionality to
// MVCCGarbageCollect but operates on range keys. It sanity-checks that no
// values exist below the range tombstones, so that no values are exposed in
// case point value GC was not performed correctly by the level above.
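//
// A hedged usage sketch (batch and ms are hypothetical; the latch span must
// cover the GC'd range key and its adjacent fragments):
//
//	rks := []CollectableGCRangeKey{{
//		MVCCRangeKey: MVCCRangeKey{
//			StartKey:  roachpb.Key("a"),
//			EndKey:    roachpb.Key("c"),
//			Timestamp: hlc.Timestamp{WallTime: 100},
//		},
//		LatchSpan: roachpb.Span{Key: roachpb.Key("a"), EndKey: roachpb.Key("c")},
//	}}
//	err := MVCCGarbageCollectRangeKeys(ctx, batch, ms, rks)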
func MVCCGarbageCollectRangeKeys(
ctx context.Context, rw ReadWriter, ms *enginepb.MVCCStats, rks []CollectableGCRangeKey,
) (retE error) {
var count int64
defer func(begin time.Time) {
log.Eventf(ctx,
"handled %d incoming range keys; deleted %d fragments in %s",
len(rks), count, timeutil.Since(begin))
if retE != nil {
log.Eventf(ctx, "err: %s", retE)
}
}(timeutil.Now())
if len(rks) == 0 {
return nil
}
// Validate range keys are well formed.
for _, rk := range rks {
if err := rk.Validate(); err != nil {
return errors.Wrap(err, "failed to validate gc range keys in mvcc gc")
}
}
sort.Slice(rks, func(i, j int) bool {
return rks[i].Compare(rks[j].MVCCRangeKey) < 0
})
// Validate that keys are non-overlapping.
for i := 1; i < len(rks); i++ {
if rks[i].StartKey.Compare(rks[i-1].EndKey) < 0 {
return errors.Errorf("range keys in gc request should be non-overlapping: %s vs %s",
rks[i-1].String(), rks[i].String())
}
}
// gcRangeKey garbage collects the given range key, using a closure to manage
// iterator lifetimes via defer.
gcRangeKey := func(gcKey CollectableGCRangeKey) error {
// lhs keeps track of any range key to the left, in case they merge to the
// right following GC.
var lhs MVCCRangeKeyStack
// Bound the iterator appropriately for the set of keys we'll be garbage
// collecting. We are using latch bounds to collect info about adjacent
// range fragments for correct MVCCStats updates.
iter, err := rw.NewMVCCIterator(ctx, MVCCKeyIterKind, IterOptions{
LowerBound: gcKey.LatchSpan.Key,
UpperBound: gcKey.LatchSpan.EndKey,
KeyTypes: IterKeyTypeRangesOnly,
ReadCategory: fs.MVCCGCReadCategory,
})
if err != nil {
return err
}
defer iter.Close()
for iter.SeekGE(MVCCKey{Key: gcKey.LatchSpan.Key}); ; iter.Next() {
if ok, err := iter.Valid(); err != nil {
return err
} else if !ok {
break
}
rangeKeys := iter.RangeKeys()
// Check if preceding range tombstone is adjacent to GC'd one. If we
// started iterating too early, just skip to next key.
if rangeKeys.Bounds.EndKey.Compare(gcKey.StartKey) <= 0 {
rangeKeys.CloneInto(&lhs)
continue
}
// Terminate loop once we've reached a range tombstone past the right
// GC range key boundary, but check if we merge with it.
if rangeKeys.Bounds.Key.Compare(gcKey.EndKey) >= 0 {
if ms != nil && lhs.CanMergeRight(rangeKeys) {
ms.Add(updateStatsOnRangeKeyMerge(rangeKeys.Bounds.Key, rangeKeys.Versions))
}
break
}
// If there's nothing to GC, keep moving.
if !rangeKeys.Oldest().LessEq(gcKey.Timestamp) {
// Even if we don't GC anything for this range fragment, we might have
// changed the previous fragment, and it might have become mergeable as a result.
if ms != nil && lhs.CanMergeRight(rangeKeys) {
ms.Add(updateStatsOnRangeKeyMerge(rangeKeys.Bounds.Key, rangeKeys.Versions))
}
rangeKeys.CloneInto(&lhs)
continue
}
// Account for any range key fragmentation due to the clears. The actual
// clears of the inside fragments will be accounted for later. We also
// truncate the bounds of the range key stack to the GC bounds, since
// this is the part we'll be clearing.
if rangeKeys.Bounds.Key.Compare(gcKey.StartKey) < 0 {
rangeKeys.Bounds.Key = gcKey.StartKey
if ms != nil {
ms.Add(UpdateStatsOnRangeKeySplit(gcKey.StartKey, rangeKeys.Versions))
}
}
if rangeKeys.Bounds.EndKey.Compare(gcKey.EndKey) > 0 {
rangeKeys.Bounds.EndKey = gcKey.EndKey
if ms != nil {
ms.Add(UpdateStatsOnRangeKeySplit(gcKey.EndKey, rangeKeys.Versions))
}
}
// Clear the range keys, and keep track of any remaining range keys. We
// do this in reverse order, so that we can shorten the slice in place
// while we're iterating.
for i := rangeKeys.Len() - 1; i >= 0; i-- {
v := rangeKeys.Versions[i]
if !v.Timestamp.LessEq(gcKey.Timestamp) {
break
}
k := rangeKeys.AsRangeKey(v)
log.Eventf(ctx, "clearing rangekey fragment: %s", k)
if err := rw.ClearMVCCRangeKey(k); err != nil {
return err
}
count++
if ms != nil {
ms.Add(updateStatsOnRangeKeyClearVersion(rangeKeys, v))
}
rangeKeys.Versions = rangeKeys.Versions[:i]
}
// Check whether we're merging with the stack to our left, and record
// the current stack for the next iteration.
if ms != nil && lhs.CanMergeRight(rangeKeys) {
ms.Add(updateStatsOnRangeKeyMerge(rangeKeys.Bounds.Key, rangeKeys.Versions))
}
rangeKeys.CloneInto(&lhs)
// Verify that there is no remaining data under the deleted range using a
// time-bound iterator.
if err := verifyNoValuesUnderRangeKey(ctx, rw, rangeKeys.Bounds, gcKey); err != nil {
return err
}
}
return nil
}
for _, gcKey := range rks {
if err := gcRangeKey(gcKey); err != nil {
return err
}
}
return nil
}
func verifyNoValuesUnderRangeKey(
ctx context.Context, reader Reader, bounds roachpb.Span, gcKey CollectableGCRangeKey,
) error {
// Use a time bound iterator to verify there is no remaining data under the
// deleted range.
ptIter, err := NewMVCCIncrementalIterator(ctx, reader, MVCCIncrementalIterOptions{
KeyTypes: IterKeyTypePointsOnly,
StartKey: bounds.Key,
EndKey: bounds.EndKey,
EndTime: gcKey.Timestamp,
IntentPolicy: MVCCIncrementalIterIntentPolicyEmit,
ReadCategory: fs.MVCCGCReadCategory,
})
if err != nil {
return err
}
defer ptIter.Close()
for ptIter.SeekGE(MVCCKey{Key: bounds.Key}); ; ptIter.Next() {
if ok, err := ptIter.Valid(); err != nil || !ok {
return err
}
// Disallow any value under the range key. We only skip intents as they
// must have a provisional value with an appropriate timestamp.
if pointKey := ptIter.UnsafeKey(); pointKey.IsValue() {
return errors.Errorf("attempt to delete range tombstone %q hiding key at %q",
gcKey, pointKey)
}
}
}
// MVCCGarbageCollectWholeRange removes all the range data and resets counters.
// It only does so if the data is completely covered by range keys.
func MVCCGarbageCollectWholeRange(
ctx context.Context,
rw ReadWriter,
ms *enginepb.MVCCStats,
start, end roachpb.Key,
gcThreshold hlc.Timestamp,
rangeStats enginepb.MVCCStats,
) error {
if rangeStats.ContainsEstimates == 0 && rangeStats.LiveCount > 0 {
return errors.Errorf("range contains live data, can't use GC clear range")
}
if _, err := CanGCEntireRange(ctx, rw, start, end, gcThreshold); err != nil {
return err
}
if err := rw.ClearRawRange(start, end, true /* pointKeys */, true /* rangeKeys */); err != nil {
return err
}
if ms != nil {
// Reset point and range counters as we deleted the whole range.
rangeStats.AgeTo(ms.LastUpdateNanos)
ms.LiveCount -= rangeStats.LiveCount
ms.LiveBytes -= rangeStats.LiveBytes
ms.KeyCount -= rangeStats.KeyCount
ms.KeyBytes -= rangeStats.KeyBytes
ms.ValCount -= rangeStats.ValCount
ms.ValBytes -= rangeStats.ValBytes
ms.RangeKeyCount -= rangeStats.RangeKeyCount
ms.RangeKeyBytes -= rangeStats.RangeKeyBytes
ms.RangeValCount -= rangeStats.RangeValCount
ms.RangeValBytes -= rangeStats.RangeValBytes
ms.GCBytesAge -= rangeStats.GCBytesAge
// We also zero out intents as a range can't be cleared if intents are
// present. This should only be the case if stats are estimates and the
// intent information was not accurate.
ms.IntentCount -= rangeStats.IntentCount
ms.IntentBytes -= rangeStats.IntentBytes
ms.LockAge -= rangeStats.LockAge
ms.LockCount -= rangeStats.LockCount
}
return nil
}
// CanGCEntireRange checks that a span of keys doesn't contain any live data
// and that all data is covered by range tombstones at or below the provided
// threshold. This function is meant for fast-path deletion by GC, where a
// range can be removed by a range tombstone.
func CanGCEntireRange(
ctx context.Context, rw Reader, start, end roachpb.Key, gcThreshold hlc.Timestamp,
) (coveredByRangeTombstones bool, err error) {
// It makes no sense to check local ranges for fast path.
if isLocal(start) || isLocal(end) {
return coveredByRangeTombstones, errors.Errorf("range emptiness check can only be done on global ranges")
}
iter, err := rw.NewMVCCIterator(ctx, MVCCKeyAndIntentsIterKind, IterOptions{
KeyTypes: IterKeyTypePointsAndRanges,
LowerBound: start,
UpperBound: end,
RangeKeyMaskingBelow: gcThreshold,
ReadCategory: fs.MVCCGCReadCategory,
})
if err != nil {
return coveredByRangeTombstones, err
}
defer iter.Close()
iter.SeekGE(MVCCKey{Key: start})
for ; ; iter.Next() {
if ok, err := iter.Valid(); err != nil {
return coveredByRangeTombstones, err
} else if !ok {
break
}
hasPoint, hasRange := iter.HasPointAndRange()
if hasPoint {
return coveredByRangeTombstones, errors.Errorf("found key not covered by range tombstone %s",
iter.UnsafeKey())
}
if hasRange {
coveredByRangeTombstones = true
newest := iter.RangeKeys().Newest()
if gcThreshold.Less(newest) {
return coveredByRangeTombstones, errors.Errorf("range tombstones above gc threshold. GC=%s, range=%s",
gcThreshold.String(), newest.String())
}
}
}
return coveredByRangeTombstones, nil
}
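// As a rough illustration of the fast path (a hypothetical caller sketch, not
// taken from the GC queue; ctx, rw, ms, rangeStats, start, end, and
// gcThreshold are assumed to be in scope):
//
//	covered, err := CanGCEntireRange(ctx, rw, start, end, gcThreshold)
//	if err == nil && covered {
//		// All data is covered by range tombstones at or below the threshold,
//		// so the whole range can be cleared and its counters reset in one shot.
//		err = MVCCGarbageCollectWholeRange(ctx, rw, ms, start, end, gcThreshold, rangeStats)
//	}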
// MVCCGarbageCollectPointsWithClearRange removes garbage-collected point data
// within the range [start@startTimestamp, end). This function performs a check
// to ensure that no non-garbage data (the most recent version, or history with
// a timestamp greater than the threshold) is being deleted. Range tombstones
// are kept intact and need to be removed separately.
func MVCCGarbageCollectPointsWithClearRange(
ctx context.Context,
rw ReadWriter,
ms *enginepb.MVCCStats,
start, end roachpb.Key,
startTimestamp hlc.Timestamp,
gcThreshold hlc.Timestamp,
) error {
var countKeys int64
var removedEntries int64
defer func(begin time.Time) {
// TODO(oleg): this could be misleading if GC fails, but this function still
// reports how many keys were GC'd. The approach is identical to what point
// key GC does for consistency, but both places could be improved.
log.Eventf(ctx,
"done with GC evaluation for clear range of %d keys at %.2f keys/sec. Deleted %d entries",
countKeys, float64(countKeys)*1e9/float64(timeutil.Since(begin)), removedEntries)
}(timeutil.Now())
iter, err := rw.NewMVCCIterator(ctx, MVCCKeyAndIntentsIterKind, IterOptions{
LowerBound: start,
UpperBound: end,
KeyTypes: IterKeyTypePointsAndRanges,
ReadCategory: fs.MVCCGCReadCategory,
})
if err != nil {
return err
}
defer iter.Close()
iter.SeekGE(MVCCKey{Key: start})
var (
// prevPointKey is a newer version (with a higher timestamp) of the current
// key. Its key component is updated when a key is first seen at the
// beginning of the loop, and its timestamp is reset to empty. Its timestamp
// component is updated at the end of each loop iteration (including
// continue). It is used to check that the current key is covered and
// eligible for GC, as well as for MVCC stats calculations.
prevPointKey MVCCKey
rangeTombstones MVCCRangeKeyStack
firstKey = true
)
for ; ; iter.Next() {
if ok, err := iter.Valid(); err != nil {
return err
} else if !ok {
break
}
if iter.RangeKeyChanged() {
iter.RangeKeys().CloneInto(&rangeTombstones)
if hasPoint, _ := iter.HasPointAndRange(); !hasPoint {
continue
}
}
// Invariant: we're now positioned on a point key. The iterator can only
// be positioned on a bare range key when `RangeKeyChanged()` returns `true`.
countKeys++
unsafeKey := iter.UnsafeKey()
newKey := !prevPointKey.Key.Equal(unsafeKey.Key)
if newKey {
unsafeKey.CloneInto(&prevPointKey)
prevPointKey.Timestamp = hlc.Timestamp{}
}
// Skip keys that fall outside of range (only until we reach the first
// eligible key).
if firstKey && unsafeKey.Compare(MVCCKey{Key: start, Timestamp: startTimestamp}) < 0 {
prevPointKey.Timestamp = unsafeKey.Timestamp
continue
}
firstKey = false
if unsafeKey.Timestamp.IsEmpty() {
// Found an unresolved intent. We use .String() explicitly as it omits
// timestamps when they are zero, whereas Format() includes them.
return errors.Errorf("attempt to GC intent %s using clear range",
unsafeKey.String())
}
if gcThreshold.Less(unsafeKey.Timestamp) {
// Current version is above GC threshold so it is not safe to clear.
return errors.Errorf("attempt to GC data %s above threshold %s with clear range",
unsafeKey, gcThreshold)
}
valueLen, isTombstone, err := iter.MVCCValueLenAndIsTombstone()
if err != nil {
return err
}
// Find timestamp covering current key.
coveredBy := prevPointKey.Timestamp
if rangeKeyCover, ok := rangeTombstones.FirstAtOrAbove(unsafeKey.Timestamp); ok {
// If there's a range tombstone between the current value and the value
// above, use that timestamp.
if coveredBy.IsEmpty() || rangeKeyCover.Timestamp.Less(coveredBy) {
coveredBy = rangeKeyCover.Timestamp
}
}
if isGarbage := !coveredBy.IsEmpty() && coveredBy.LessEq(gcThreshold) || isTombstone; !isGarbage {
// Current version is below the threshold and is not a tombstone, but the
// preceding one is above the threshold, so it is visible and can't be
// cleared.
return errors.Errorf("attempt to GC data %s still visible at GC threshold %s with clear range",
unsafeKey, gcThreshold)
}
validTill := coveredBy
if isTombstone {
validTill = unsafeKey.Timestamp
}
if ms != nil {
if newKey {
ms.Add(updateStatsOnGC(unsafeKey.Key, int64(EncodedMVCCKeyPrefixLength(unsafeKey.Key)), 0,
true /* metaKey */, validTill.WallTime))
}
ms.Add(updateStatsOnGC(unsafeKey.Key, MVCCVersionTimestampSize, int64(valueLen), false, /* metaKey */
validTill.WallTime))
}
prevPointKey.Timestamp = unsafeKey.Timestamp
removedEntries++
}
// If startTimestamp is not empty, only a subset of the versions of the first
// key of the requested range is cleared; otherwise its full extent is cleared.
if err := rw.ClearMVCCVersions(MVCCKey{Key: start, Timestamp: startTimestamp}, MVCCKey{Key: end}); err != nil {
return err
}
return nil
}
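// To make the garbage check above concrete (a purely hypothetical example,
// not taken from a test): with gcThreshold=10, a version k@5 whose next newer
// version is k@7 has coveredBy=7, and 7 <= 10, so k@5 is garbage and may be
// cleared. If the next newer version were k@12 instead, coveredBy=12 > 10, so
// k@5 is still visible at the threshold and the function returns an error
// rather than clearing it (unless k@5 is itself a tombstone).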
// MVCCFindSplitKey finds a key from the given span such that the left side of
// the split is roughly targetSize bytes. It only considers MVCC point keys, not
// range keys. The returned key will never be chosen from the key ranges listed
// in keys.NoSplitSpans.
func MVCCFindSplitKey(
ctx context.Context, reader Reader, key, endKey roachpb.RKey, targetSize int64,
) (roachpb.Key, error) {
if key.Less(roachpb.RKey(keys.LocalMax)) {
key = roachpb.RKey(keys.LocalMax)
}
it, err := reader.NewMVCCIterator(
ctx, MVCCKeyAndIntentsIterKind, IterOptions{
UpperBound: endKey.AsRawKey(),
ReadCategory: fs.BatchEvalReadCategory,
})
if err != nil {
return nil, err
}
defer it.Close()
// We want to avoid splitting at the first key in the range because that
// could result in an empty left-hand range. To prevent this, we scan for
// the first key in the range and consider the key that sorts directly after
// this as the minimum split key.
//
// In addition, we must never return a split key that falls within a table
// row. (Rows in tables with multiple column families are comprised of
// multiple keys, one key per column family.)
//
// Managing this is complicated: the logic for picking a split key that
// creates ranges of the right size lives in C++, while the logic for
// determining whether a key falls within a table row lives in Go.
//
// Most of the time, we can let C++ pick whatever key it wants. If it picks a
// key in the middle of a row, we simply rewind the key to the start of the
// row. This is handled by keys.EnsureSafeSplitKey.
//
// If, however, that first row in the range is so large that it exceeds the
// range size threshold on its own, and that row is comprised of multiple
// column families, we have a problem. C++ will hand us a key in the middle of
// that row, keys.EnsureSafeSplitKey will rewind the key to the beginning of
// the row, and... we'll end up with what's likely to be the start key of the
// range. The higher layers of the stack will take this to mean that no splits
// are required, when in fact the range is desperately in need of a split.
//
// Note that the first range of a table or a partition of a table does not
// start on a row boundary and so we have a slightly different problem.
// Instead of not splitting the range at all, we'll create a split at the
// start of the first row, resulting in an unnecessary empty range from the
// beginning of the table to the first row in the table (e.g., from /Table/51
// to /Table/51/1/aardvark...). The right-hand side of the split will then be
// susceptible to never being split as outlined above.
//
// To solve both of these problems, we find the end of the first row in Go,
// then plumb that to C++ as a "minimum split key." We're then guaranteed that
// the key C++ returns will never rewind to the start key of the range.
//
// On a related note, we find the first row by actually looking at the first
// key in the range. A previous version of this code attempted to derive
// the first row only by looking at `key`, the start key of the range; this
// was dangerous because partitioning can split off ranges that do not start
// at valid row keys. The keys that are present in the range, by contrast, are
// necessarily valid row keys.
minSplitKey, err := mvccMinSplitKey(it, key.AsRawKey())
if err != nil {
return nil, err
} else if minSplitKey == nil {
return nil, nil
}
splitKey, err := it.FindSplitKey(key.AsRawKey(), endKey.AsRawKey(), minSplitKey, targetSize)
if err != nil {
return nil, err
}
// Ensure the key is a valid split point that does not fall in the middle of a
// SQL row by removing the column family ID, if any, from the end of the key.
return keys.EnsureSafeSplitKey(splitKey.Key)
}
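// For intuition (a hypothetical example reusing the key shapes from the
// comment above): if the engine proposes a split key that falls on one of the
// column family keys in the middle of row /Table/51/1/aardvark,
// keys.EnsureSafeSplitKey rewinds it to the start of that row; the minimum
// split key computed by mvccMinSplitKey below ensures the rewound key lands
// past the first row, so the left-hand side of the split is never empty.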
// mvccMinSplitKey returns the minimum key that a range may be split at. The
// caller is responsible for setting the iterator upper bound to the range end
// key. The caller is also responsible for closing the iterator.
func mvccMinSplitKey(it MVCCIterator, startKey roachpb.Key) (roachpb.Key, error) {
it.SeekGE(MakeMVCCMetadataKey(startKey))
if ok, err := it.Valid(); err != nil {
return nil, err
} else if !ok {
return nil, nil
}
var minSplitKey roachpb.Key
if _, tenID, err := keys.DecodeTenantPrefix(it.UnsafeKey().Key); err == nil {
if _, _, err := keys.MakeSQLCodec(tenID).DecodeTablePrefix(it.UnsafeKey().Key); err == nil {
// The first key in this range represents a row in a SQL table. Advance the
// minSplitKey past this row to avoid the problems described above.
firstRowKey, err := keys.EnsureSafeSplitKey(it.UnsafeKey().Key.Clone())
if err != nil {
return nil, err
}
// Allow a split key before other rows in the same table.
minSplitKey = firstRowKey.PrefixEnd()
}
}
if minSplitKey == nil {
// The first key in the range does not represent a row in a SQL table.
// Allow a split at any key that sorts after it.
minSplitKey = it.UnsafeKey().Key.Clone().Next()
}
return minSplitKey, nil
}
// MVCCFirstSplitKey returns the first key which is safe to split at and no
// less than desiredSplitKey in the range which spans [startKey,endKey). If a
// non-nil key is returned, it is safe to split at. If a nil key is returned, no
// safe split key could be determined. The safe split key returned is
// guaranteed to be:
//
// 1. Within [startKey,endKey).
// 2. No less than desiredSplitKey.
// 3. Greater than the first key in [startKey,endKey); or greater than all the
//    first row's keys if a table range.
// 4. Not in between the start and end of a row for table ranges.
//
// The returned split key is NOT guaranteed to be outside a no-split span, such
// as Meta2Max or Node Liveness.
func MVCCFirstSplitKey(
ctx context.Context, reader Reader, desiredSplitKey, startKey, endKey roachpb.RKey,
) (roachpb.Key, error) {
// If the start key of the range is within the meta1 key space, the range
// cannot be split.
if startKey.Less(roachpb.RKey(keys.LocalMax)) {
return nil, nil
}
it, err := reader.NewMVCCIterator(
ctx, MVCCKeyAndIntentsIterKind, IterOptions{
UpperBound: endKey.AsRawKey(),
ReadCategory: fs.BatchEvalReadCategory,
})
if err != nil {
return nil, err
}
defer it.Close()
// If the caller has provided a desiredSplitKey less than the minimum split
// key, we update the desired split key to be the minimum split key. This
// prevents splitting before the first row in a table range, which would
// result in the LHS having no rows.
minSplitKey, err := mvccMinSplitKey(it, startKey.AsRawKey())
if err != nil {
return nil, err
} else if minSplitKey == nil {
return nil, nil
}
var seekKey roachpb.Key
if minSplitKey.Compare(desiredSplitKey.AsRawKey()) > 0 {
seekKey = minSplitKey
} else {
seekKey = desiredSplitKey.AsRawKey()
}
it.SeekGE(MakeMVCCMetadataKey(seekKey))
if ok, err := it.Valid(); err != nil {
return nil, err
} else if !ok {
return nil, nil
}
return keys.EnsureSafeSplitKey(it.UnsafeKey().Key.Clone())
}
// willOverflow returns true iff adding both inputs would under- or overflow
// the 64 bit integer range.
func willOverflow(a, b int64) bool {
// Morally MinInt64 < a+b < MaxInt64, but without overflows.
// First make sure that a <= b. If not, swap them.
if a > b {
a, b = b, a
}
// Now b is the larger of the numbers, and we compare sizes
// in a way that can never over- or underflow.
if b > 0 {
return a > math.MaxInt64-b
}
return math.MinInt64-b > a
}
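// A quick worked example of the check above (illustrative only): with a=5 and
// b=math.MaxInt64 we have b > 0 and 5 > math.MaxInt64-b = 0, so willOverflow
// reports true; with a=-1 and b=math.MaxInt64 it reports false, since
// -1 <= math.MaxInt64-b = 0.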
// ComputeStats scans the given key span and computes MVCC stats. nowNanos
// specifies the wall time in nanoseconds since the epoch and is used to compute
// age-related stats quantities.
func ComputeStats(
ctx context.Context, r Reader, start, end roachpb.Key, nowNanos int64,
) (enginepb.MVCCStats, error) {
return ComputeStatsWithVisitors(ctx, r, start, end, nowNanos, ComputeStatsVisitors{})
}
// ComputeStatsVisitors holds a set of callbacks that are invoked on each key
// during stats computation.
type ComputeStatsVisitors struct {
PointKey func(MVCCKey, []byte) error
RangeKey func(MVCCRangeKeyValue) error
LockTableKey func(LockTableKey, []byte) error
}
// ComputeStatsWithVisitors is like ComputeStats, but also takes callbacks that
// are invoked on each key.
func ComputeStatsWithVisitors(
ctx context.Context,
r Reader,
start, end roachpb.Key,
nowNanos int64,
visitors ComputeStatsVisitors,
) (enginepb.MVCCStats, error) {
ctx, sp := tracing.ChildSpan(ctx, "ComputeStatsWithVisitors")
defer sp.Finish()
if isLockTableKey(start) {
return computeLockTableStatsWithVisitors(ctx, r, start, end, nowNanos, visitors.LockTableKey)
}
iter, err := r.NewMVCCIterator(ctx, MVCCKeyAndIntentsIterKind, IterOptions{
KeyTypes: IterKeyTypePointsAndRanges,
LowerBound: start,
UpperBound: end,
})
if err != nil {
return enginepb.MVCCStats{}, err
}
defer iter.Close()
iter.SeekGE(MVCCKey{Key: start})
return computeStatsForIterWithVisitors(iter, nowNanos, visitors.PointKey, visitors.RangeKey)
}
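// A minimal sketch of how the visitor hooks might be used (hypothetical
// caller code; ctx, r, start, end, and nowNanos are assumed to be in scope):
//
//	var points, ranges int
//	ms, err := ComputeStatsWithVisitors(ctx, r, start, end, nowNanos,
//		ComputeStatsVisitors{
//			PointKey: func(key MVCCKey, value []byte) error { points++; return nil },
//			RangeKey: func(rkv MVCCRangeKeyValue) error { ranges++; return nil },
//		})
//
// On success, ms holds the span's MVCC stats and points/ranges count the
// visited keys.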
// ComputeStatsForIter is like ComputeStats, but scans across the given iterator
// until exhausted. The iterator must have appropriate bounds, key types, and
// intent options set, and it must have been seeked to the appropriate starting
// point.
//
// We don't take start/end here, because that would require expensive key
// comparisons. We also don't seek to e.g. MinKey, because that might violate
// spanset assertions.
//
// Most callers should use ComputeStats() instead. This exists primarily for use
// with SST iterators.
func ComputeStatsForIter(iter SimpleMVCCIterator, nowNanos int64) (enginepb.MVCCStats, error) {
return computeStatsForIterWithVisitors(iter, nowNanos, nil, nil)
}
// computeStatsForIterWithVisitors performs the actual stats computation for the
// other ComputeStats methods.
//
// The iterator must already have been seeked. This requirement is to comply
// with spanset assertions, such that ComputeStats can seek to the given start
// key (satisfying the spanset asserter), while ComputeStatsForIter can seek to
// MinKey (in effect the iterator's lower bound) as it's geared towards SST
// iterators which are not subject to spanset assertions.
//
// Notably, we do not want to take the start/end key here, and instead rely on
// the iterator's bounds, to avoid expensive key comparisons.
func computeStatsForIterWithVisitors(
iter SimpleMVCCIterator,
nowNanos int64,
pointKeyVisitor func(MVCCKey, []byte) error,
rangeKeyVisitor func(MVCCRangeKeyValue) error,
) (enginepb.MVCCStats, error) {
var ms enginepb.MVCCStats
var meta enginepb.MVCCMetadata
var prevKey roachpb.Key
var first bool
// Values start accruing GCBytesAge at the timestamp at which they
// are shadowed (i.e. overwritten) whereas deletion tombstones
// use their own timestamp. We're iterating through versions in
// reverse chronological order and use this variable to keep track
// of the point in time at which the current key begins to age.
var accrueGCAgeNanos int64
var rangeTombstones MVCCRangeKeyVersions
for ; ; iter.Next() {
if ok, err := iter.Valid(); err != nil {
return ms, err
} else if !ok {
break
}
// Process MVCC range tombstones, and buffer them in rangeTombstones
// for all overlapping point keys.
if iter.RangeKeyChanged() {
if hasPoint, hasRange := iter.HasPointAndRange(); hasRange {
rangeKeys := iter.RangeKeys()
rangeKeys.Versions.CloneInto(&rangeTombstones)
for i, v := range rangeTombstones {
// Only the top-most fragment contributes the key and its bounds, but
// all versions contribute timestamps and values.
//
// NB: Point keys always use 12 bytes for the key timestamp, even
// though it is actually variable-length, likely for historical
// reasons. But for range keys we may as well use the actual
// variable-length encoded size.
keyBytes := int64(EncodedMVCCTimestampSuffixLength(v.Timestamp))
valBytes := int64(len(v.Value))
if i == 0 {
ms.RangeKeyCount++
keyBytes += int64(EncodedMVCCKeyPrefixLength(rangeKeys.Bounds.Key) +
EncodedMVCCKeyPrefixLength(rangeKeys.Bounds.EndKey))
}
ms.RangeKeyBytes += keyBytes
ms.RangeValCount++
ms.RangeValBytes += valBytes
ms.GCBytesAge += (keyBytes + valBytes) * (nowNanos/1e9 - v.Timestamp.WallTime/1e9)
if rangeKeyVisitor != nil {
if err := rangeKeyVisitor(rangeKeys.AsRangeKeyValue(v)); err != nil {
return enginepb.MVCCStats{}, err
}
}
}
if !hasPoint {
continue
}
} else {
rangeTombstones.Clear()
}
}
unsafeKey := iter.UnsafeKey()
if pointKeyVisitor != nil {
// NB: pointKeyVisitor is typically nil, so we will typically not call
// iter.UnsafeValue().
v, err := iter.UnsafeValue()
if err != nil {
return enginepb.MVCCStats{}, err
}
if err := pointKeyVisitor(unsafeKey, v); err != nil {
return enginepb.MVCCStats{}, err
}
}
isSys := isSysLocal(unsafeKey.Key)
if isSys {
// Check for ignored keys.
if bytes.HasPrefix(unsafeKey.Key, keys.LocalRangeIDPrefix) {
// RangeID-local key.
_ /* rangeID */, infix, suffix, _ /* detail */, err := keys.DecodeRangeIDKey(unsafeKey.Key)
if err != nil {
return enginepb.MVCCStats{}, errors.Wrap(err, "unable to decode rangeID key")
}
if infix.Equal(keys.LocalRangeIDReplicatedInfix) {
// Replicated RangeID-local key.
if suffix.Equal(keys.LocalRangeAppliedStateSuffix) {
// RangeAppliedState key. Ignore.
continue
}
}
}
// Check for lock table keys, which are not handled by this
// function. They are handled by computeLockTableStatsWithVisitors
// instead.
if bytes.HasPrefix(unsafeKey.Key, keys.LocalRangeLockTablePrefix) {
return enginepb.MVCCStats{}, errors.AssertionFailedf(
"lock table key encountered by ComputeStats: %s", unsafeKey.Key)
}
}
isValue := unsafeKey.IsValue()
implicitMeta := isValue && !bytes.Equal(unsafeKey.Key, prevKey)
prevKey = append(prevKey[:0], unsafeKey.Key...)
// Find the closest range tombstone above the point key. Range tombstones
// cannot exist above intents, and are undefined across inline values, so we
// only take them into account for versioned values.
//
// TODO(erikgrinaker): Rather than doing a full binary search for each
// point, we can keep track of the current index and move downwards in the
// stack as we descend through older versions, resetting once we hit a new
// key.
var nextRangeTombstone hlc.Timestamp
if isValue {
if !rangeTombstones.IsEmpty() && unsafeKey.Timestamp.LessEq(rangeTombstones.Newest()) {
if v, ok := rangeTombstones.FirstAtOrAbove(unsafeKey.Timestamp); ok {
nextRangeTombstone = v.Timestamp
}
}
}
var valueLen int
var mvccValueIsTombstone bool
if isValue {
// MVCC value
var err error
valueLen, mvccValueIsTombstone, err = iter.MVCCValueLenAndIsTombstone()
if err != nil {
return enginepb.MVCCStats{}, errors.Wrap(err, "unable to decode MVCCValue")
}
} else {
valueLen = iter.ValueLen()
}
if implicitMeta {
// INVARIANT: implicitMeta => isValue.
// No MVCCMetadata entry for this series of keys.
meta.Reset()
meta.KeyBytes = MVCCVersionTimestampSize
meta.ValBytes = int64(valueLen)
meta.Deleted = mvccValueIsTombstone
meta.Timestamp.WallTime = unsafeKey.Timestamp.WallTime
}
if !isValue || implicitMeta {
metaKeySize := int64(len(unsafeKey.Key)) + 1
var metaValSize int64
if !implicitMeta {
metaValSize = int64(valueLen)
}
totalBytes := metaKeySize + metaValSize
first = true
if !implicitMeta {
v, err := iter.UnsafeValue()
if err != nil {
return enginepb.MVCCStats{}, err
}
if err := protoutil.Unmarshal(v, &meta); err != nil {
return ms, errors.Wrap(err, "unable to decode MVCCMetadata")
}
}
if isSys {
ms.SysBytes += totalBytes
ms.SysCount++
if isAbortSpanKey(unsafeKey.Key) {
ms.AbortSpanBytes += totalBytes
}
} else {
if meta.Deleted {
// First value is deleted, so it's GC'able; add meta key & value bytes to age stat.
ms.GCBytesAge += totalBytes * (nowNanos/1e9 - meta.Timestamp.WallTime/1e9)
} else if nextRangeTombstone.IsSet() {
// First value was deleted by a range tombstone, so it accumulates GC age from
// the range tombstone's timestamp.
ms.GCBytesAge += totalBytes * (nowNanos/1e9 - nextRangeTombstone.WallTime/1e9)
} else {
ms.LiveBytes += totalBytes
ms.LiveCount++
}
ms.KeyBytes += metaKeySize
ms.ValBytes += metaValSize
ms.KeyCount++
if meta.IsInline() {
ms.ValCount++
}
}
if !implicitMeta {
continue
}
}
totalBytes := int64(valueLen) + MVCCVersionTimestampSize
if isSys {
ms.SysBytes += totalBytes
} else {
if first {
first = false
if meta.Deleted {
// First value is deleted, so it's GC'able; add key & value bytes to age stat.
ms.GCBytesAge += totalBytes * (nowNanos/1e9 - meta.Timestamp.WallTime/1e9)
} else if nextRangeTombstone.IsSet() {
// First value was deleted by a range tombstone; add key & value bytes to
// age stat from range tombstone onwards.
ms.GCBytesAge += totalBytes * (nowNanos/1e9 - nextRangeTombstone.WallTime/1e9)
} else {
ms.LiveBytes += totalBytes
}
if meta.Txn != nil {
ms.IntentBytes += totalBytes
ms.IntentCount++
ms.LockCount++
ms.LockAge += nowNanos/1e9 - meta.Timestamp.WallTime/1e9
}
if meta.KeyBytes != MVCCVersionTimestampSize {
return ms, errors.Errorf("expected mvcc metadata key bytes to equal %d; got %d "+
"(meta: %s)", MVCCVersionTimestampSize, meta.KeyBytes, &meta)
}
if meta.ValBytes != int64(valueLen) {
return ms, errors.Errorf("expected mvcc metadata val bytes to equal %d; got %d "+
"(meta: %s)", valueLen, meta.ValBytes, &meta)
}
accrueGCAgeNanos = meta.Timestamp.WallTime
} else {
// Overwritten value. Is it a deletion tombstone?
if mvccValueIsTombstone {
// The contribution of the tombstone picks up GCByteAge from its own timestamp on.
ms.GCBytesAge += totalBytes * (nowNanos/1e9 - unsafeKey.Timestamp.WallTime/1e9)
} else if nextRangeTombstone.IsSet() && nextRangeTombstone.WallTime < accrueGCAgeNanos {
// The kv pair was deleted by a range tombstone below the next
// version, so it accumulates garbage from the range tombstone.
ms.GCBytesAge += totalBytes * (nowNanos/1e9 - nextRangeTombstone.WallTime/1e9)
} else {
// The kv pair is an overwritten value, so it became non-live when the closest more
// recent value was written.
ms.GCBytesAge += totalBytes * (nowNanos/1e9 - accrueGCAgeNanos/1e9)
}
// Update for the next version we may end up looking at.
accrueGCAgeNanos = unsafeKey.Timestamp.WallTime
}
ms.KeyBytes += MVCCVersionTimestampSize
ms.ValBytes += int64(valueLen)
ms.ValCount++
}
}
ms.LastUpdateNanos = nowNanos
return ms, nil
}
// computeLockTableStatsWithVisitors performs stats computation for the lock
// table keys in the given span. It is split off from the main ComputeStats
// logic because lock table iteration requires an EngineIterator (which is
// wrapped in a LockTableIterator), while the main ComputeStats logic uses an
// (intent interleaving) MVCCIterator.
//
// Unlike computeStatsForIterWithVisitors, this function accepts a Reader and
// a start and end key. The start and end key must both be lock table keys.
func computeLockTableStatsWithVisitors(
ctx context.Context,
r Reader,
start, end roachpb.Key,
nowNanos int64,
lockTableKeyVisitor func(LockTableKey, []byte) error,
) (enginepb.MVCCStats, error) {
iter, err := NewLockTableIterator(ctx, r, LockTableIteratorOptions{
LowerBound: start,
UpperBound: end,
MatchMinStr: lock.Shared, // all locks
})
if err != nil {
return enginepb.MVCCStats{}, err
}
defer iter.Close()
var ms enginepb.MVCCStats
var meta enginepb.MVCCMetadata
var ok bool
for ok, err = iter.SeekEngineKeyGE(EngineKey{Key: start}); ok; ok, err = iter.NextEngineKey() {
key, err := iter.UnsafeLockTableKey()
if err != nil {
return enginepb.MVCCStats{}, err
}
if key.Strength == lock.Intent {
// The contributions of intents to the MVCCStats are handled by
// computeStatsForIterWithVisitors, which uses an intent
// interleaving iterator to interpret the mvcc keyspace. That
// function draws a distinction between provisional versioned values
// that are associated with intents and committed versioned values
// that are not.
//
// For simplicity, we ignore intents in this function.
continue
}
val, err := iter.UnsafeValue()
if err != nil {
return enginepb.MVCCStats{}, err
}
if err := protoutil.Unmarshal(val, &meta); err != nil {
return ms, errors.Wrap(err, "unable to decode MVCCMetadata")
}
if lockTableKeyVisitor != nil {
if err := lockTableKeyVisitor(key, val); err != nil {
return enginepb.MVCCStats{}, err
}
}
keyBytes := key.EncodedSize()
valBytes := int64(len(val))
ms.LockBytes += keyBytes + valBytes
ms.LockCount++
ms.LockAge += nowNanos/1e9 - meta.Timestamp.WallTime/1e9
}
if err != nil {
return enginepb.MVCCStats{}, err
}
ms.LastUpdateNanos = nowNanos
return ms, nil
}
// MVCCIsSpanEmptyOptions configures the MVCCIsSpanEmpty function.
type MVCCIsSpanEmptyOptions struct {
// StartKey determines the start of the checked span.
StartKey roachpb.Key
// EndKey determines the end of the checked span (exclusive).
EndKey roachpb.Key
// StartTS and EndTS determine the scanned time range as (startTS, endTS].
StartTS, EndTS hlc.Timestamp
}
// MVCCIsSpanEmpty returns true if there are no MVCC keys whatsoever in the key
// span in the requested time interval. If a time interval is given and any
// inline values are encountered, an error may be returned.
func MVCCIsSpanEmpty(
ctx context.Context, reader Reader, opts MVCCIsSpanEmptyOptions,
) (isEmpty bool, _ error) {
// Only use an MVCCIncrementalIterator if time bounds are given, since it will
// error on any inline values, and the caller may want to respect them instead.
var iter SimpleMVCCIterator
if opts.StartTS.IsEmpty() && opts.EndTS.IsEmpty() {
var err error
iter, err = reader.NewMVCCIterator(ctx, MVCCKeyAndIntentsIterKind, IterOptions{
KeyTypes: IterKeyTypePointsAndRanges,
LowerBound: opts.StartKey,
UpperBound: opts.EndKey,
ReadCategory: fs.BatchEvalReadCategory,
})
if err != nil {
return false, err
}
} else {
var err error
iter, err = NewMVCCIncrementalIterator(ctx, reader, MVCCIncrementalIterOptions{
KeyTypes: IterKeyTypePointsAndRanges,
StartKey: opts.StartKey,
EndKey: opts.EndKey,
StartTime: opts.StartTS,
EndTime: opts.EndTS,
IntentPolicy: MVCCIncrementalIterIntentPolicyEmit,
ReadCategory: fs.BatchEvalReadCategory,
})
if err != nil {
return false, err
}
}
defer iter.Close()
iter.SeekGE(MVCCKey{Key: opts.StartKey})
valid, err := iter.Valid()
if err != nil {
return false, err
}
return !valid, nil
}
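// A minimal usage sketch (hypothetical caller code; ctx, reader, and the span
// and time bounds are assumed to be in scope). Leaving StartTS and EndTS empty
// checks for any keys at all, while setting them restricts the check to the
// (StartTS, EndTS] window:
//
//	empty, err := MVCCIsSpanEmpty(ctx, reader, MVCCIsSpanEmptyOptions{
//		StartKey: startKey,
//		EndKey:   endKey,
//		StartTS:  startTS,
//		EndTS:    endTS,
//	})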
// MVCCExportFingerprint exports a fingerprint for point keys in the keyrange
// [StartKey, EndKey) over the interval (StartTS, EndTS]. Each key/timestamp and
// value is hashed using a fnv64 hasher, and combined into a running aggregate
// via a XOR. On completion of the export this aggregate is returned as the
// fingerprint.
//
// Range keys are not fingerprinted but instead written to a pebble SST that is
// returned to the caller. This is because range keys do not have a stable,
// discrete identity and so it is up to the caller to define a deterministic
// fingerprinting scheme across all returned range keys. The returned boolean
// indicates whether any range keys were encountered during the export; the
// caller uses it to throw away the empty SST file and avoid unnecessary
// allocations.
func MVCCExportFingerprint(
ctx context.Context, cs *cluster.Settings, reader Reader, opts MVCCExportOptions, dest io.Writer,
) (kvpb.BulkOpSummary, ExportRequestResumeInfo, uint64, bool, error) {
ctx, span := tracing.ChildSpan(ctx, "storage.MVCCExportFingerprint")
defer span.Finish()
hasher := fnv.New64()
fingerprintWriter := makeFingerprintWriter(ctx, hasher, cs, dest, opts.FingerprintOptions)
defer fingerprintWriter.Close()
summary, resumeInfo, exportErr := mvccExportToWriter(ctx, reader, opts, &fingerprintWriter)
if exportErr != nil {
return kvpb.BulkOpSummary{}, ExportRequestResumeInfo{}, 0, false, exportErr
}
fingerprint, err := fingerprintWriter.Finish()
if err != nil {
return kvpb.BulkOpSummary{}, ExportRequestResumeInfo{}, 0, false, err
}
hasRangeKeys := fingerprintWriter.sstWriter.DataSize != 0
return summary, resumeInfo, fingerprint, hasRangeKeys, nil
}
// MVCCExportToSST exports changes to the keyrange [StartKey, EndKey) over the
// interval (StartTS, EndTS] as a Pebble SST. See mvccExportToWriter for more
// details.
func MVCCExportToSST(
ctx context.Context, cs *cluster.Settings, reader Reader, opts MVCCExportOptions, dest io.Writer,
) (kvpb.BulkOpSummary, ExportRequestResumeInfo, error) {
ctx, span := tracing.ChildSpan(ctx, "storage.MVCCExportToSST")
defer span.Finish()
sstWriter := MakeTransportSSTWriter(ctx, cs, dest)
defer sstWriter.Close()
summary, resumeInfo, exportErr := mvccExportToWriter(ctx, reader, opts, &sstWriter)
if exportErr != nil {
return kvpb.BulkOpSummary{}, ExportRequestResumeInfo{}, exportErr
}
if summary.DataSize == 0 {
// If no records were added to the sstable, skip
// completing it and return an empty summary.
//
// We still propagate the resumeKey because our iteration may have been
// halted because of resource limitations before any keys were added to the
// returned SST. We also propagate the error because an
// ExportOverElasticCPULimitError is used to signal that we should paginate
// and return a response to the client, instead of retrying immediately.
return summary, resumeInfo, exportErr
}
if err := sstWriter.Finish(); err != nil {
return kvpb.BulkOpSummary{}, ExportRequestResumeInfo{}, err
}
return summary, resumeInfo, nil
}
// ExportWriter is a trimmed down version of the Writer interface. It contains
// only those methods used during ExportRequest command evaluation.
type ExportWriter interface {
// PutRawMVCCRangeKey writes an MVCC range key with the provided encoded
// MVCCValue. It will replace any overlapping range keys at the given
// timestamp (even partial overlap). Only MVCC range tombstones, i.e. an empty
// value, are currently allowed (other kinds will need additional handling in
// MVCC APIs and elsewhere, e.g. stats and GC). It can be used to avoid
// decoding and immediately re-encoding an MVCCValue, but should generally be
// avoided due to the lack of type safety.
//
// It is safe to modify the contents of the arguments after PutRawMVCCRangeKey
// returns.
PutRawMVCCRangeKey(MVCCRangeKey, []byte) error
// PutRawMVCC sets the given key to the encoded MVCCValue. It requires that
// the timestamp is non-empty (see {PutUnversioned,PutIntent} if the timestamp
// is empty). It can be used to avoid decoding and immediately re-encoding an
// MVCCValue, but should generally be avoided due to the lack of type safety.
//
// It is safe to modify the contents of the arguments after PutRawMVCC
// returns.
PutRawMVCC(key MVCCKey, value []byte) error
// PutUnversioned sets the given key to the value provided. It is for use
// with inline metadata (not intents) and other unversioned keys (like
// Range-ID local keys).
//
// It is safe to modify the contents of the arguments after Put returns.
PutUnversioned(key roachpb.Key, value []byte) error
}
type ExportRequestResumeInfo struct {
ResumeKey MVCCKey
CPUOverlimit bool
}
// mvccExportToWriter exports changes to the keyrange [StartKey, EndKey) over
// the interval (StartTS, EndTS] to the passed in writer. See MVCCExportOptions
// for options. StartTS may be zero.
//
// This comes in two principal flavors: all revisions or latest revision only.
// In all-revisions mode, exports everything matching the span and time bounds,
// i.e. extracts contiguous blocks of MVCC history. In latest-revision mode,
// extracts just the changes necessary to transform an MVCC snapshot at StartTS
// into one equivalent to the data at EndTS, but without including all
// intermediate revisions not visible at EndTS. The latter mode is used for
// incremental backups that can only be restored to EndTS, the former allows
// restoring to any intermediate timestamp.
//
// Tombstones (both point and MVCC range tombstones) are treated like revisions.
// That is, if all revisions are requested, all tombstones in (StartTS, EndTS]
// and overlapping [StartKey, EndKey) are returned. If only the latest revision
// is requested, only the most recent matching tombstone is returned.
//
// Intents within the time and span bounds will return a LockConflictError, while
// intents outside are ignored.
//
// Returns an export summary and a resume key that allows resuming the export if
// it reached a limit. Data is written to the writer as it is collected. If an
// error is returned then the writer's contents are undefined. It is the
// responsibility of the caller to Finish() / Close() the passed in writer.
func mvccExportToWriter(
ctx context.Context, reader Reader, opts MVCCExportOptions, writer ExportWriter,
) (kvpb.BulkOpSummary, ExportRequestResumeInfo, error) {
// If we're not exporting all revisions then we can mask point keys below any
// MVCC range tombstones, since we don't care about them.
var rangeKeyMasking hlc.Timestamp
if !opts.ExportAllRevisions {
rangeKeyMasking = opts.EndTS
}
elasticCPUHandle := admission.ElasticCPUWorkHandleFromContext(ctx)
// NB: StartTimer is used to denote that we're just starting to do
// the actual on-CPU work we acquired CPU tokens for. We've seen that before
// hitting this code path, we may have already used up our allotted CPU slice
// resolving intents or doing conflict resolution. The effect was that
// OverLimit() below is immediately true, and we exported just a single key
// for the entire request, making for extremely inefficient backups. By
// starting the timer here we guarantee that our allotted CPU slice is spent
// actually doing the backup work.
elasticCPUHandle.StartTimer()
startTime := timeutil.Now()
iter, err := NewMVCCIncrementalIterator(ctx, reader, MVCCIncrementalIterOptions{
KeyTypes: IterKeyTypePointsAndRanges,
StartKey: opts.StartKey.Key,
EndKey: opts.EndKey,
StartTime: opts.StartTS,
EndTime: opts.EndTS,
RangeKeyMaskingBelow: rangeKeyMasking,
IntentPolicy: MVCCIncrementalIterIntentPolicyAggregate,
ReadCategory: fs.BackupReadCategory,
MaxLockConflicts: opts.MaxLockConflicts,
TargetLockConflictBytes: opts.TargetLockConflictBytes,
})
if err != nil {
return kvpb.BulkOpSummary{}, ExportRequestResumeInfo{}, err
}
defer func() {
if opts.ScanStats != nil {
recordIteratorStats(iter, opts.ScanStats)
opts.ScanStats.NumScans++
}
// ExportRequests can sometimes be slow and exceed the deadline.
// Explicitly log the iterator stats if canceled.
if log.V(1) {
select {
case <-ctx.Done():
stats := iter.Stats()
elapsed := timeutil.Since(startTime)
preWorkCPUTime, workCPUTime := elasticCPUHandle.RunningTime()
log.Errorf(ctx,
"export exceeded deadline work wall: %v, cpu: %v, pre-work-cpu: %v, stats: %v",
elapsed, workCPUTime, preWorkCPUTime, &stats.Stats)
default:
}
}
iter.Close()
}()
paginated := opts.TargetSize > 0
hasElasticCPULimiter := elasticCPUHandle != nil
// trackKeyBoundary is true if we need to know whether the
// iteration has proceeded to a new key.
//
// If opts.ExportAllRevisions is false, then our iteration loop
// will use NextKey() and thus will always be on a new key.
//
// If opts.ExportAllRevisions is true, we only need to track
// key boundaries if we may return from our iteration before
// the EndKey. This can happen if the user has requested
// paginated results, or if we hit a resource limit.
trackKeyBoundary := opts.ExportAllRevisions && (paginated || hasElasticCPULimiter)
firstIteration := true
// skipTombstones controls whether we include tombstones.
//
// We want tombstones if we are exporting all revisions or if
// we have a StartTS. A non-empty StartTS is used by
// incremental backups and thus needs to see tombstones if
// that happens to be the latest value.
skipTombstones := !opts.ExportAllRevisions && opts.StartTS.IsEmpty()
var rows RowCounter
// Only used if trackKeyBoundary is true.
var curKey roachpb.Key
var resumeKey MVCCKey
var resumeIsCPUOverLimit bool
var rangeKeys MVCCRangeKeyStack
var rangeKeysSize int64
// maxRangeKeysSizeIfTruncated calculates the worst-case size of the currently
// buffered rangeKeys if we were to stop iteration after the current resumeKey
// and flush the pending range keys with the new truncation bound given by
// resumeKey.
//
// For example, if we've buffered the range keys [a-c)@2 and [a-c)@1 with
// total size 4 bytes, and then hit a byte limit between the point keys bbb@3
// and bbb@1, we have to truncate the two range keys to [a-bbb\0)@1 and
// [a-bbb\0)@2. The size of the flushed range keys is now 10 bytes, not 4
// bytes. Since we're never allowed to exceed MaxSize, we have to check before
// adding bbb@3 that we have room for both bbb@3 and the pending range keys if
// they were to be truncated and flushed after it.
//
// This could either truncate the range keys at resumeKey, at resumeKey.Next()
// if StopMidKey is enabled, or at the range keys' actual end key if there
// doesn't end up being any further point keys covered by it and we go on to
// flush them as-is at their normal end key. We need to make sure we have
// enough MaxSize budget to flush them in all of these cases.
maxRangeKeysSizeIfTruncated := func(resumeKey roachpb.Key) int64 {
if rangeKeysSize == 0 {
return 0
}
// We could be truncated in the middle of a point key version series, which
// would require adding on a \0 byte via Key.Next(), so let's assume that.
maxSize := rangeKeysSize
endKeySize := len(rangeKeys.Bounds.EndKey)
if s := maxSize + int64(rangeKeys.Len()*(len(resumeKey)-endKeySize+1)); s > maxSize {
maxSize = s
}
return maxSize
}
var valueScratch []byte
iter.SeekGE(opts.StartKey)
for {
if ok, err := iter.Valid(); err != nil {
return kvpb.BulkOpSummary{}, ExportRequestResumeInfo{}, err
} else if !ok {
break
} else if iter.TryGetIntentError() != nil {
break
}
unsafeKey := iter.UnsafeKey()
// isNewKey is true when we aren't tracking key
// boundaries because either we are not exporting all
// revisions or because we know we won't stop before a
// key boundary anyway.
isNewKey := !trackKeyBoundary || !unsafeKey.Key.Equal(curKey)
if trackKeyBoundary && isNewKey {
curKey = append(curKey[:0], unsafeKey.Key...)
}
if firstIteration {
// Don't check resources on first iteration to ensure we can make some progress regardless
// of starvation. Otherwise operations could spin indefinitely.
firstIteration = false
} else {
// Check if we're over our allotted CPU time + on a key boundary (we
// prefer callers being able to use SSTs directly). Going over limit is
// accounted for in admission control by penalizing the subsequent
// request, so going slightly over is fine.
stopAllowed := isNewKey || opts.StopMidKey
if overLimit, _ := elasticCPUHandle.OverLimit(); overLimit && stopAllowed {
resumeKey = unsafeKey.Clone()
if isNewKey {
resumeKey.Timestamp = hlc.Timestamp{}
}
resumeIsCPUOverLimit = true
break
}
}
// When we encounter an MVCC range tombstone stack, we buffer it in
// rangeKeys until we've moved past it or iteration ends (e.g. due to a
// limit). If we return a resume key then we need to truncate the final
// range key stack (and thus the SST) to the resume key, so we can't flush
// them until we've moved past.
if iter.RangeKeyChanged() {
// Flush any pending range tombstones.
for _, v := range rangeKeys.Versions {
mvccValue, ok, err := tryDecodeSimpleMVCCValue(v.Value)
if !ok && err == nil {
mvccValue, err = decodeExtendedMVCCValue(v.Value, false)
}
if err != nil {
return kvpb.BulkOpSummary{}, ExportRequestResumeInfo{}, errors.Wrapf(err,
"decoding mvcc value %s", v.Value)
}
// Export only the inner roachpb.Value, not the MVCCValue header.
rawValue := mvccValue.Value.RawBytes
if err := writer.PutRawMVCCRangeKey(rangeKeys.AsRangeKey(v), rawValue); err != nil {
return kvpb.BulkOpSummary{}, ExportRequestResumeInfo{}, err
}
}
rows.BulkOpSummary.DataSize += rangeKeysSize
rangeKeys.Clear()
rangeKeysSize = 0
// Buffer any new range keys.
hasPoint, hasRange := iter.HasPointAndRange()
if hasRange && !skipTombstones {
if opts.ExportAllRevisions {
iter.RangeKeys().CloneInto(&rangeKeys)
} else {
rks := iter.RangeKeys()
rks.Versions = rks.Versions[:1]
rks.CloneInto(&rangeKeys)
}
for _, v := range rangeKeys.Versions {
rangeKeysSize += int64(
len(rangeKeys.Bounds.Key) + len(rangeKeys.Bounds.EndKey) + len(v.Value))
}
// Check if the range keys exceed a limit, using similar logic as point
// keys. We have to check both the size of the range keys as they are (in
// case we emit them as-is), and the size of the range keys if they were
// to be truncated at the start key due to a resume span (which could
// happen if the next point key exceeds the max size).
//
// TODO(erikgrinaker): The limit logic here is a bit of a mess, but we're
// complying with the existing point key logic for now. We should get rid
// of some of the options and clean this up.
curSize := rows.BulkOpSummary.DataSize
reachedTargetSize := opts.TargetSize > 0 && uint64(curSize) >= opts.TargetSize
newSize := curSize + maxRangeKeysSizeIfTruncated(rangeKeys.Bounds.Key)
reachedMaxSize := opts.MaxSize > 0 && newSize > int64(opts.MaxSize)
if curSize > 0 && paginated && (reachedTargetSize || reachedMaxSize) {
rangeKeys.Clear()
rangeKeysSize = 0
resumeKey = unsafeKey.Clone()
log.VInfof(ctx, 2, "paginating ExportRequest: rangekeys hit size limit: "+
"reachedTargetSize: %t, reachedMaxSize: %t", reachedTargetSize, reachedMaxSize)
break
}
if reachedMaxSize {
return kvpb.BulkOpSummary{}, ExportRequestResumeInfo{}, &ExceedMaxSizeError{
reached: newSize, maxSize: opts.MaxSize}
}
}
// If we're on a bare range key, step forward. We can't use NextKey()
// because there may be a point key at the range key's start bound.
if !hasPoint {
iter.Next()
continue
}
}
// Process point keys.
unsafeValue, err := iter.UnsafeValue()
if err != nil {
return kvpb.BulkOpSummary{}, ExportRequestResumeInfo{}, err
}
skip := false
if unsafeKey.IsValue() {
mvccValue, ok, err := tryDecodeSimpleMVCCValue(unsafeValue)
if !ok && err == nil {
mvccValue, err = decodeExtendedMVCCValue(unsafeValue, opts.IncludeMVCCValueHeader)
}
if err != nil {
return kvpb.BulkOpSummary{}, ExportRequestResumeInfo{}, errors.Wrapf(err, "decoding mvcc value %s", unsafeKey)
}
if !ok && opts.IncludeMVCCValueHeader {
buf, canRetainBuf, err := EncodeMVCCValueForExport(mvccValue, valueScratch[:0])
if err != nil {
return kvpb.BulkOpSummary{}, ExportRequestResumeInfo{}, errors.Wrapf(err, "repackaging imported mvcc value %s", unsafeKey)
}
if canRetainBuf {
valueScratch = buf
}
unsafeValue = buf
} else {
unsafeValue = mvccValue.Value.RawBytes
}
// Skip tombstone records when start time is zero (non-incremental)
// and we are not exporting all versions.
skip = skipTombstones && mvccValue.IsTombstone()
}
if !skip {
if err := rows.Count(unsafeKey.Key); err != nil {
return kvpb.BulkOpSummary{}, ExportRequestResumeInfo{}, errors.Wrapf(err, "decoding %s", unsafeKey)
}
curSize := rows.BulkOpSummary.DataSize
curSizeWithRangeKeys := curSize + maxRangeKeysSizeIfTruncated(unsafeKey.Key)
reachedTargetSize := curSizeWithRangeKeys > 0 &&
uint64(curSizeWithRangeKeys) >= opts.TargetSize
kvSize := int64(len(unsafeKey.Key) + len(unsafeValue))
if curSize == 0 && opts.MaxSize > 0 && kvSize > int64(opts.MaxSize) {
// This single key exceeds the MaxSize. Even if we paginate below, this will still fail.
return kvpb.BulkOpSummary{}, ExportRequestResumeInfo{}, &ExceedMaxSizeError{reached: kvSize, maxSize: opts.MaxSize}
}
newSize := curSize + kvSize
newSizeWithRangeKeys := curSizeWithRangeKeys + kvSize
reachedMaxSize := opts.MaxSize > 0 && newSizeWithRangeKeys > int64(opts.MaxSize)
// When paginating we stop writing in two cases:
// - target size is reached and we wrote all versions of a key
// - maximum size reached and we are allowed to stop mid key
if paginated && (isNewKey && reachedTargetSize || opts.StopMidKey && reachedMaxSize) {
resumeKey = unsafeKey.Clone()
if isNewKey || !opts.StopMidKey {
resumeKey.Timestamp = hlc.Timestamp{}
}
log.VInfof(ctx, 2, "paginating ExportRequest: point keys hit size limit: "+
"reachedTargetSize: %t, reachedMaxSize: %t", reachedTargetSize, reachedMaxSize)
break
}
if reachedMaxSize {
return kvpb.BulkOpSummary{}, ExportRequestResumeInfo{}, &ExceedMaxSizeError{
reached: newSizeWithRangeKeys, maxSize: opts.MaxSize}
}
if unsafeKey.Timestamp.IsEmpty() {
// This should never be an intent since the incremental iterator returns
// an error when encountering intents.
if err := writer.PutUnversioned(unsafeKey.Key, unsafeValue); err != nil {
return kvpb.BulkOpSummary{}, ExportRequestResumeInfo{}, errors.Wrapf(err, "adding key %s", unsafeKey)
}
} else {
if err := writer.PutRawMVCC(unsafeKey, unsafeValue); err != nil {
return kvpb.BulkOpSummary{}, ExportRequestResumeInfo{}, errors.Wrapf(err, "adding key %s", unsafeKey)
}
}
rows.BulkOpSummary.DataSize = newSize
}
if opts.ExportAllRevisions {
iter.Next()
} else {
iter.NextKey()
}
}
// First check if we encountered an intent while iterating the data.
// If we did, it means this export can't complete and is aborted. We need to
// loop over the remaining data to collect all matching intents before
// returning them in an error to the caller.
// TODO: move below logic inside MVCCIncrementalIterator, make the iterator advance for intent collection when an
// intent is found.
if iter.TryGetIntentError() != nil {
// If we encounter an error during intent collection, bail out but return
// an intent error. MVCCIncrementalIterator will enforce MaxLockConflicts
// and return an error when exceeded.
for ok, _ := iter.Valid(); ok; ok, _ = iter.Valid() {
iter.NextKey()
}
return kvpb.BulkOpSummary{}, ExportRequestResumeInfo{}, iter.TryGetIntentError()
}
// Flush any pending buffered range keys, truncated to the resume key (if
// any). If there is a resume timestamp, i.e. when resuming in between two
// versions, the range keys must cover the resume key too. This will cause the
// next export's range keys to overlap with this one, e.g.: [a-f) with resume
// key c@7 will export range keys [a-c\0) first, and then [c-f) when resuming,
// which overlaps at [c-c\0).
if !rangeKeys.IsEmpty() {
// Calculate the new rangeKeysSize due to the new resume bounds.
if len(resumeKey.Key) > 0 && rangeKeys.Bounds.EndKey.Compare(resumeKey.Key) > 0 {
oldEndLen := len(rangeKeys.Bounds.EndKey)
rangeKeys.Bounds.EndKey = resumeKey.Key
if resumeKey.Timestamp.IsSet() {
rangeKeys.Bounds.EndKey = rangeKeys.Bounds.EndKey.Next()
}
rangeKeysSize += int64(rangeKeys.Len() * (len(rangeKeys.Bounds.EndKey) - oldEndLen))
}
for _, v := range rangeKeys.Versions {
mvccValue, ok, err := tryDecodeSimpleMVCCValue(v.Value)
if !ok && err == nil {
mvccValue, err = decodeExtendedMVCCValue(v.Value, false)
}
if err != nil {
return kvpb.BulkOpSummary{}, ExportRequestResumeInfo{}, errors.Wrapf(err,
"decoding mvcc value %s", v.Value)
}
// Export only the inner roachpb.Value, not the MVCCValue header.
rawValue := mvccValue.Value.RawBytes
if err := writer.PutRawMVCCRangeKey(rangeKeys.AsRangeKey(v), rawValue); err != nil {
return kvpb.BulkOpSummary{}, ExportRequestResumeInfo{}, err
}
}
rows.BulkOpSummary.DataSize += rangeKeysSize
}
return rows.BulkOpSummary, ExportRequestResumeInfo{ResumeKey: resumeKey, CPUOverlimit: resumeIsCPUOverLimit}, nil
}
// MVCCExportOptions contains options for MVCCExportToSST.
type MVCCExportOptions struct {
// StartKey determines start of the exported interval (inclusive).
// StartKey.Timestamp is either empty which represent starting from a potential
// intent and continuing to versions or non-empty, which represents starting
// from a particular version.
StartKey MVCCKey
// EndKey determines the end of exported interval (exclusive).
EndKey roachpb.Key
// StartTS and EndTS determine exported time range as (startTS, endTS].
StartTS, EndTS hlc.Timestamp
// If ExportAllRevisions is true export every revision of a key for the interval,
// otherwise only the latest value within the interval is exported.
ExportAllRevisions bool
// If TargetSize is positive, it indicates that the export should produce SSTs
// which are roughly target size. Specifically, it will return an SST such that
// the last key is responsible for meeting or exceeding the targetSize, unless the
// iteration has been stopped because of resource limitations.
TargetSize uint64
// If MaxSize is positive, it is an absolute maximum on byte size for the
// returned sst. If it is the case that the versions of the last key will lead
// to an SST that exceeds maxSize, an error will be returned. This parameter
// exists to prevent creating SSTs which are too large to be used.
MaxSize uint64
// MaxLockConflicts specifies the number of locks (intents) to collect and
// return in a LockConflictError. The caller will likely resolve the returned
// intents and retry the call, which would be quadratic, so this significantly
// reduces the overall number of scans.
//
// The zero value indicates no limit.
MaxLockConflicts uint64
// TargetLockConflictBytes specifies the size target of locks collected in
// LockConflictError. The lock collection process will stop when the collected
// locks exceed this byte limit. This prevents excessive allocation of memory
// from large intents. A zero value disables this target.
TargetLockConflictBytes uint64
// If StopMidKey is false, once the function reaches targetSize it continues
// adding all versions until it reaches the next key or the end of the range.
// If true, it stops immediately when targetSize is reached and returns the
// next version's timestamp in resumeTs so that a subsequent operation can
// pass it to firstKeyTs.
//
// NB: If the result contains MVCC range tombstones, this can cause MVCC range
// tombstones in two subsequent SSTs to overlap. For example, given the range
// tombstone [a-f)@5, if we stop between c@4 and c@2 and return a resume key c@2,
// then the response will contain a truncated MVCC range tombstone [a-c\0)@5
// which covers the point key at c, but resuming from c@2 will contain the
// MVCC range tombstone [c-f)@5 which overlaps with the MVCC range tombstone
// in the previous response in the interval [c-c\0)@5. This overlap will not
// cause problems with multiplexed iteration using NewSSTIterator(), nor when
// ingesting the SSTs via `AddSSTable`.
StopMidKey bool
// FingerprintOptions controls how fingerprints are generated
// when using MVCCExportFingerprint.
FingerprintOptions MVCCExportFingerprintOptions
// IncludeMVCCValueHeader controls whether we include
// MVCCValueHeaders in the exported data. When true, the
// portions of the header appropriate for export are included
// in the encoded values. Callers should be ready to decode
// full MVCCValue's in this case.
IncludeMVCCValueHeader bool
// ScanStats, if set, is updated with iterator stats upon export success of
// failure. Non-iterator stats i.e., {NumGets,NumReverseScans} are left
// unchanged, and NumScans is incremented by 1.
ScanStats *kvpb.ScanStats
}
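// As an illustration only (a hypothetical configuration, not prescribed by
// this package; ctx, cs, reader, and an io.Writer buf are assumed to be in
// scope), an incremental, paginated export over (startTS, endTS] might look
// like:
//
//	opts := MVCCExportOptions{
//		StartKey:           MVCCKey{Key: startKey},
//		EndKey:             endKey,
//		StartTS:            startTS,
//		EndTS:              endTS,
//		ExportAllRevisions: true,
//		TargetSize:         16 << 20, // aim for roughly 16 MiB SSTs
//		MaxSize:            64 << 20, // hard cap per SST
//		StopMidKey:         true,
//	}
//	summary, resume, err := MVCCExportToSST(ctx, cs, reader, opts, buf)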
type MVCCExportFingerprintOptions struct {
// If StripTenantPrefix is true, keys that appear to be
// tenant-prefixed have the tenant-prefix removed before
// hashing.
StripTenantPrefix bool
// If StripValueChecksum is true, checksums are removed from
// the value before hashing.
StripValueChecksum bool
// If StripIndexPrefixAndTimestamp is true, the key's timestamp and index
// prefix are not hashed. Because the index prefix is stripped, this option
// should only get used in the table key space.
StripIndexPrefixAndTimestamp bool
}
// PeekRangeKeysLeft peeks for any range keys to the left of the given key.
// It returns the relative position of any range keys to the peek key, along
// with the (unsafe) range key stack:
//
// -1: range key to the left not touching the peek key, or no range key found.
//
// 0: range key to the left ends at the peek key.
//
// +1: range key to the left overlaps with the peek key, extending to the right.
func PeekRangeKeysLeft(iter MVCCIterator, peekKey roachpb.Key) (int, MVCCRangeKeyStack, error) {
iter.SeekLT(MVCCKey{Key: peekKey})
if ok, err := iter.Valid(); err != nil {
return 0, MVCCRangeKeyStack{}, err
} else if !ok {
return -1, MVCCRangeKeyStack{}, nil
} else if _, hasRange := iter.HasPointAndRange(); !hasRange {
return -1, MVCCRangeKeyStack{}, nil
}
rangeKeys := iter.RangeKeys()
return rangeKeys.Bounds.EndKey.Compare(peekKey), rangeKeys, nil
}
// PeekRangeKeysRight peeks for any range keys to the right of the given key.
// It returns the relative position of any range keys to the peek key, along
// with the (unsafe) range key stack:
//
// -1: range key to the right overlaps with the peek key, extending to the left.
//
// 0: range key to the right starts at the peek key.
//
// +1: range key to the right not touching the peek key, or no range key found.
func PeekRangeKeysRight(iter MVCCIterator, peekKey roachpb.Key) (int, MVCCRangeKeyStack, error) {
iter.SeekGE(MVCCKey{Key: peekKey})
if ok, err := iter.Valid(); err != nil {
return 0, MVCCRangeKeyStack{}, err
} else if !ok {
return 1, MVCCRangeKeyStack{}, nil
} else if _, hasRange := iter.HasPointAndRange(); !hasRange {
return 1, MVCCRangeKeyStack{}, nil
}
rangeKeys := iter.RangeKeys()
return rangeKeys.Bounds.Key.Compare(peekKey), rangeKeys, nil
}
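// For example (a hypothetical sketch of how a caller might use the peek
// result; iter and key are assumed to be in scope), a caller adjusting stats
// around a cleared boundary can check whether a range key stack starts exactly
// at key:
//
//	cmp, rhs, err := PeekRangeKeysRight(iter, key)
//	if err == nil && cmp == 0 {
//		// A range key stack begins exactly at key; rhs holds its (unsafe)
//		// versions and can be inspected before deciding whether stats need a
//		// merge or split adjustment.
//	}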
// ReplacePointTombstonesWithRangeTombstones will replace existing point
// tombstones with equivalent point-sized range tombstones in the given span,
// updating stats as needed. Only the most recent version is considered.
// If end is nil, start.Next() is assumed.
//
// NB: The caller must disable spanset assertions for the reader, since we'll
// peek beyond the given bounds to adjust range key stats. We're not terribly
// concerned about any stats mismatches caused by these missing latches.
func ReplacePointTombstonesWithRangeTombstones(
ctx context.Context, rw ReadWriter, ms *enginepb.MVCCStats, start, end roachpb.Key,
) error {
// We don't want to emit DeleteRange rangefeed events, since these may be
// below the resolved timestamp by now.
rw = DisableOpLogger(rw)
if keys.IsLocal(start) {
return nil
}
if len(end) == 0 {
end = start.Next()
}
iter, err := rw.NewMVCCIterator(ctx, MVCCKeyAndIntentsIterKind, IterOptions{
KeyTypes: IterKeyTypePointsAndRanges,
Prefix: end.Equal(start.Next()),
LowerBound: start,
UpperBound: end,
// Don't bother with ReadCategory since this function is used in tests.
})
if err != nil {
return err
}
defer iter.Close()
var clearedKey MVCCKey
var rangeKeys MVCCRangeKeyStack
iter.SeekGE(MVCCKey{Key: start})
for {
if ok, err := iter.Valid(); err != nil {
return err
} else if !ok {
break
}
if iter.RangeKeyChanged() {
iter.RangeKeys().CloneInto(&rangeKeys)
}
// Skip bare range keys.
hasPoint, hasRange := iter.HasPointAndRange()
if !hasPoint {
iter.Next()
continue
}
key := iter.UnsafeKey()
// Skip intents and inline values, and system table keys which
// might be watched by rangefeeds.
if key.Timestamp.IsEmpty() || isWatchedSystemTable(key.Key) {
iter.NextKey()
continue
}
// Skip non-tombstone values.
valueLen, isTombstone, err := iter.MVCCValueLenAndIsTombstone()
if err != nil {
return err
}
if !isTombstone {
iter.NextKey()
continue
}
// Skip keys below range tombstones. We can't use range key masking because
// we may need to see older versions of point keys that are below a range
// tombstone.
if hasRange && key.Timestamp.LessEq(rangeKeys.Newest()) {
iter.NextKey()
continue
}
// Clear the point key, and construct a meta record for stats.
clearedMeta := &enginepb.MVCCMetadata{
KeyBytes: MVCCVersionTimestampSize,
ValBytes: int64(valueLen),
Deleted: true,
Timestamp: key.Timestamp.ToLegacyTimestamp(),
}
clearedKey.Key = append(clearedKey.Key[:0], key.Key...)
clearedKey.Timestamp = key.Timestamp
clearedKeySize := int64(EncodedMVCCKeyPrefixLength(clearedKey.Key))
if err := rw.ClearMVCC(key, ClearOptions{
ValueSizeKnown: true,
ValueSize: uint32(valueLen),
}); err != nil {
return err
}
// Step to the next key to look for an older version, and construct a meta
// record for stats.
var restoredMeta *enginepb.MVCCMetadata
iter.Next()
if ok, err := iter.Valid(); err != nil {
return err
} else if ok {
if key = iter.UnsafeKey(); key.Key.Equal(clearedKey.Key) {
valueLen, isTombstone, err = iter.MVCCValueLenAndIsTombstone()
if err != nil {
return err
}
restoredMeta = &enginepb.MVCCMetadata{
KeyBytes: MVCCVersionTimestampSize,
ValBytes: int64(valueLen),
Deleted: isTombstone,
Timestamp: key.Timestamp.ToLegacyTimestamp(),
}
if _, hasRange := iter.HasPointAndRange(); hasRange {
if v, ok := iter.RangeKeys().FirstAtOrAbove(key.Timestamp); ok {
restoredMeta.Deleted = true
restoredMeta.KeyBytes = 0
restoredMeta.ValBytes = 0
restoredMeta.Timestamp = v.Timestamp.ToLegacyTimestamp()
}
}
}
}
if ms != nil {
var restoredKeySize, restoredNanos int64
if restoredMeta != nil {
restoredKeySize = clearedKeySize
restoredNanos = restoredMeta.Timestamp.WallTime
}
ms.Add(updateStatsOnClear(clearedKey.Key,
clearedKeySize, 0, restoredKeySize, 0, clearedMeta, restoredMeta, restoredNanos))
}
// Write the range tombstone, with proper stats.
if err := MVCCDeleteRangeUsingTombstone(ctx, rw, ms,
clearedKey.Key, clearedKey.Key.Next(), clearedKey.Timestamp, hlc.ClockTimestamp{},
start.Prevish(roachpb.PrevishKeyLength), end.Next(), false, 0, 0, nil); err != nil {
return err
}
// If we restored a version at this key, step to the next key. Otherwise,
// we're already on the next key.
if restoredMeta != nil {
iter.NextKey()
}
}
return nil
}
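// Illustrative usage sketch (hypothetical names): a testing knob might invoke
// the rewrite over a request's span, with ms possibly nil when the caller does
// not track MVCC stats.
//
//	if err := ReplacePointTombstonesWithRangeTombstones(
//	    ctx, readWriter, ms, span.Key, span.EndKey,
//	); err != nil {
//	    return err
//	}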
// In order to test the correctness of range deletion tombstones, we added a
// testing knob to replace point deletions with range deletion tombstones in
// some tests. Unfortunately, doing so affects the correctness of rangefeeds.
// The tests in question do not use rangefeeds, but some system functionality
// does use rangefeeds internally. The primary impact is that catch-up scans
// will miss deletes. That makes these issues rare and hard to detect. In order
// to deflake these tests, we avoid rewriting deletes on relevant system
// tables.
func isWatchedSystemTable(key roachpb.Key) bool {
rem, _, err := keys.DecodeTenantPrefix(key)
if err != nil { // allow unprefixed keys to pass through
return false
}
_, tableID, _, err := keys.DecodeTableIDIndexID(rem)
if err != nil { // allow keys which do not correspond to sql tables
return false
}
switch tableID {
case keys.SettingsTableID, keys.SpanConfigurationsTableID,
keys.SQLInstancesTableID, keys.DescriptorTableID, keys.ZonesTableID:
return true
default:
return false
}
}
// MVCCLookupRangeKeyValue reads the value header for a range deletion on
// [key,endKey) at the specified timestamp. The range deletion is allowed to be
// fragmented (with identical value) and is allowed to extend out of
// [key,endKey). An error is returned if a matching range deletion cannot be
// found.
func MVCCLookupRangeKeyValue(
ctx context.Context, reader Reader, key, endKey roachpb.Key, ts hlc.Timestamp,
) ([]byte, error) {
it, err := reader.NewMVCCIterator(ctx, MVCCKeyIterKind, IterOptions{
LowerBound: key,
UpperBound: endKey,
KeyTypes: IterKeyTypeRangesOnly,
ReadCategory: fs.RangefeedReadCategory,
})
if err != nil {
return nil, err
}
defer it.Close()
it.SeekGE(MVCCKey{Key: key})
// Start by assuming that we've already seen [min, key) and now we're iterating
// to fill this up to [min, endKey).
span := roachpb.Span{
Key: roachpb.KeyMin,
EndKey: append([]byte(nil), key...), // copy since we'll mutate this memory
}
first := true
var val []byte
for ; ; it.Next() {
ok, err := it.Valid()
if err != nil {
return nil, err
}
if !ok {
break
}
rkv, ok := it.RangeKeys().FirstAtOrAbove(ts)
if !ok || rkv.Timestamp != ts {
return nil, errors.Errorf(
"gap [%s,...) in expected range deletion [%s,%s)", span.EndKey, key, endKey)
}
unsafeBounds := it.RangeBounds() // only valid until next call to iterator
if !span.EndKey.Equal(unsafeBounds.Key) {
return nil, errors.Errorf(
"gap [%s,%s) in expected range deletion [%s,%s)", span.EndKey, unsafeBounds.Key, key, endKey,
)
}
if first {
val = append(val, rkv.Value...)
first = false
} else if !bytes.Equal(val, rkv.Value) {
return nil, errors.Errorf(
"value change at %s in expected range deletion [%s,%s)", unsafeBounds.Key, key, endKey)
}
span.EndKey = append(span.EndKey[:0], unsafeBounds.EndKey...)
}
if !span.EndKey.Equal(endKey) {
return nil, errors.Errorf(
"gap [%s,...) in expected range deletion [%s,%s)", span.EndKey, key, endKey)
}
// Made it!
return val, nil
}
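// A minimal usage sketch, assuming the raw value returned here decodes as an
// MVCCValue (the reader, keys, and timestamp are hypothetical and supplied by
// the caller):
//
//	valueRaw, err := MVCCLookupRangeKeyValue(ctx, reader, key, endKey, ts)
//	if err != nil {
//	    return err
//	}
//	v, err := DecodeMVCCValue(valueRaw)
//	if err != nil {
//	    return err
//	}
//	_ = v.MVCCValueHeader // e.g. inspect the header written with the range deletion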
// Copyright 2019 The Cockroach Authors.
//
// Use of this software is governed by the CockroachDB Software License
// included in the /LICENSE file.
package storage
import (
"context"
"fmt"
"github.com/cockroachdb/cockroach/pkg/kv/kvpb"
"github.com/cockroachdb/cockroach/pkg/roachpb"
"github.com/cockroachdb/cockroach/pkg/storage/enginepb"
"github.com/cockroachdb/cockroach/pkg/storage/fs"
"github.com/cockroachdb/cockroach/pkg/util"
"github.com/cockroachdb/cockroach/pkg/util/hlc"
"github.com/cockroachdb/cockroach/pkg/util/metamorphic"
"github.com/cockroachdb/cockroach/pkg/util/protoutil"
"github.com/cockroachdb/errors"
)
// mvccIncrementalIteratorMetamorphicTBI will randomly enable TBIs.
var mvccIncrementalIteratorMetamorphicTBI = metamorphic.ConstantWithTestBool(
"mvcc-incremental-iter-tbi", true)
// MVCCIncrementalIterator iterates over the diff of the key range
// [startKey,endKey) and time range (startTime,endTime]. If a key was added or
// modified between startTime and endTime, the iterator will position at the
// most recent version (before or at endTime) of that key. If the key was most
// recently deleted, this is signaled with an empty value.
//
// Inline (unversioned) values are not supported, and may return an error or be
// omitted entirely. The iterator should not be used across such keys.
//
// Intents outside the time bounds are ignored. Intents inside the
// time bounds are handled according to the provided
// MVCCIncrementalIterIntentPolicy. By default, an error will be
// returned.
//
// Note: The endTime is inclusive to be consistent with the non-incremental
// iterator, where reads at a given timestamp return writes at that
// timestamp. The startTime is then made exclusive so that iterating time 1 to
// 2 and then 2 to 3 will only return values with time 2 once. An exclusive
// start time would normally make it difficult to scan timestamp 0, but
// CockroachDB uses that as a sentinel for key metadata anyway.
//
// Expected usage:
//
// iter, err := NewMVCCIncrementalIterator(ctx, reader, MVCCIncrementalIterOptions{
//     StartTime: startTime,
//     EndTime:   endTime,
//     EndKey:    endKey,
// })
// if err != nil { ... }
// defer iter.Close()
// for iter.SeekGE(MakeMVCCMetadataKey(startKey)); ; iter.Next() {
//     ok, err := iter.Valid()
//     if !ok { ... }
//     [code using iter.UnsafeKey() and iter.UnsafeValue()]
// }
//
// Note regarding the correctness of the time-bound iterator optimization:
//
// When using (t_s, t_e], say there is a version (committed or provisional)
// k@t where t is in that interval, that is visible to iter. All sstables
// containing k@t will be included in timeBoundIter. Note that there may be
// multiple sequence numbers for the key k@t at the storage layer, say k@t#n1,
// k@t#n2, where n1 > n2, some of which may be deleted, but the latest
// sequence number will be visible using iter (since not being visible would be
// a contradiction of the initial assumption that k@t is visible to iter).
// Since there is no delete across all sstables that deletes k@t#n1, there is
// no delete in the subset of sstables used by timeBoundIter that deletes
// k@t#n1, so the timeBoundIter will see k@t.
type MVCCIncrementalIterator struct {
iter MVCCIterator
// A time-bound iterator cannot be used by itself due to a bug in the time-
// bound iterator (#28358). This was historically augmented with an iterator
// without the time-bound optimization to act as a sanity iterator, but
// issues remained (#43799), so now the iterator above is the main iterator,
// and the timeBoundIter is only used to check whether any keys can be
// skipped by the main iterator.
timeBoundIter MVCCIterator
startTime hlc.Timestamp
endTime hlc.Timestamp
err error
valid bool
// For allocation avoidance, meta is used to store the timestamp of keys
// regardless of whether they are metakeys.
meta enginepb.MVCCMetadata
// hasPoint and hasRange control whether the iterator should surface a point
// or range key from the underlying iterator. If true, this implies that the
// underlying iterator returns true as well. This can be used to hide point or
// range keys where one key kind satisfies the time predicate but the other
// one doesn't. Ignored following IgnoringTime() calls.
hasPoint, hasRange bool
// rangeKeys contains the filtered range keys at the current location.
rangeKeys MVCCRangeKeyStack
// rangeKeysIgnoringTime contains the complete range keys at the current location.
rangeKeysIgnoringTime MVCCRangeKeyStack
// rangeKeyChanged is true if i.rangeKeys changed during the previous
// positioning operation.
rangeKeyChanged bool
// rangeKeyChangedIgnoringTime is true if i.rangeKeysIgnoringTime changed
// during the previous positioning operation.
rangeKeyChangedIgnoringTime bool
// ignoringTime is true if the iterator is currently ignoring time bounds,
// i.e. following a call to NextIgnoringTime().
ignoringTime bool
// Configuration passed in MVCCIncrementalIterOptions.
intentPolicy MVCCIncrementalIterIntentPolicy
// Optional collection of intents created on demand when first intent encountered.
intents []roachpb.Intent
// maxLockConflicts is the maximum number of conflicting locks collected
// before returning a LockConflictError. This setting only applies under
// MVCCIncrementalIterIntentPolicyAggregate. The caller must call
// TryGetIntentError even when the number of collected intents is below the
// threshold.
//
// The zero value indicates no limit.
maxLockConflicts uint64
// targetLockConflictBytes sets a target byte size for the intents collected
// into a LockConflictError. Intent collection stops once the total intent
// size exceeds this threshold. This setting only applies under
// MVCCIncrementalIterIntentPolicyAggregate. The caller must call
// TryGetIntentError even when the total size of collected intents is below
// the threshold.
//
// The zero value indicates no limit.
targetLockConflictBytes uint64
// collectedIntentBytes tracks the collected intents' memory usage; intent
// collection may stop early if targetLockConflictBytes is reached. This is
// only relevant under MVCCIncrementalIterIntentPolicyAggregate.
collectedIntentBytes uint64
}
var _ SimpleMVCCIterator = &MVCCIncrementalIterator{}
// MVCCIncrementalIterIntentPolicy controls how the
// MVCCIncrementalIterator will handle intents that it encounters
// when iterating.
type MVCCIncrementalIterIntentPolicy int
const (
// MVCCIncrementalIterIntentPolicyError will immediately
// return an error for any intent found inside the given time
// range.
MVCCIncrementalIterIntentPolicyError MVCCIncrementalIterIntentPolicy = iota
// MVCCIncrementalIterIntentPolicyAggregate will not fail on the
// first encountered intent, but will keep iterating. All found
// intents are aggregated into a single LockConflictError, which
// is updated during iteration. The number of intents in the
// LockConflictError is constrained by the MaxLockConflicts
// setting. The consumer is free to decide whether it wants to
// keep collecting entries and intents, or to skip entries.
MVCCIncrementalIterIntentPolicyAggregate
// MVCCIncrementalIterIntentPolicyEmit will return intents to
// the caller, whether they fall inside or outside the time range.
MVCCIncrementalIterIntentPolicyEmit
// MVCCIncrementalIterIntentPolicyIgnore will not emit intents at all, by
// disabling intent interleaving and filtering out any encountered intents.
// This gives a minor performance improvement, but is only safe if the caller
// has already checked the lock table prior to using the iterator. Otherwise,
// any provisional values will be emitted, as they can't be disambiguated from
// committed values.
MVCCIncrementalIterIntentPolicyIgnore
)
// MVCCIncrementalIterOptions bundles options for NewMVCCIncrementalIterator.
type MVCCIncrementalIterOptions struct {
KeyTypes IterKeyType
StartKey roachpb.Key
EndKey roachpb.Key
// Only keys within (StartTime,EndTime] will be emitted. EndTime defaults to
// hlc.MaxTimestamp. The time-bound iterator optimization will only be used if
// StartTime is set, since we assume EndTime will be near the current time.
StartTime hlc.Timestamp
EndTime hlc.Timestamp
// RangeKeyMaskingBelow will mask point keys covered by MVCC range tombstones
// below the given timestamp. For more details, see IterOptions.
//
// NB: This masking also affects NextIgnoringTime(), which cannot see points
// below MVCC range tombstones either.
RangeKeyMaskingBelow hlc.Timestamp
IntentPolicy MVCCIncrementalIterIntentPolicy
// ReadCategory is used to map to a user-understandable category string, for
// stats aggregation and metrics, and a Pebble-understandable QoS.
ReadCategory fs.ReadCategory
// MaxLockConflicts is the maximum number of conflicting locks collected before
// returning a LockConflictError. This setting only applies under
// MVCCIncrementalIterIntentPolicyAggregate. The caller must call TryGetIntentError
// even when the number of collected intents is below the threshold.
//
// The zero value indicates no limit.
MaxLockConflicts uint64
// TargetLockConflictBytes sets a target byte size for the intents collected into
// a LockConflictError. Intent collection stops once the total intent size exceeds
// the target threshold. This setting only applies under
// MVCCIncrementalIterIntentPolicyAggregate. The caller must call TryGetIntentError
// even when the total size of collected intents is below the threshold.
//
// The zero value indicates no limit.
TargetLockConflictBytes uint64
}
// NewMVCCIncrementalIterator creates an MVCCIncrementalIterator with the
// specified reader and options. The timestamp hint range should not be more
// restrictive than the start and end time range.
func NewMVCCIncrementalIterator(
ctx context.Context, reader Reader, opts MVCCIncrementalIterOptions,
) (*MVCCIncrementalIterator, error) {
// Default to MaxTimestamp for EndTime, since the code assumes it is set.
if opts.EndTime.IsEmpty() {
opts.EndTime = hlc.MaxTimestamp
}
// We assume EndTime is near the current time, so there is little to gain from
// using a TBI unless StartTime is set. However, we always vary it in
// metamorphic test builds, for better test coverage of both paths.
useTBI := opts.StartTime.IsSet()
if metamorphic.IsMetamorphicBuild() { // NB: always randomize when metamorphic
useTBI = mvccIncrementalIteratorMetamorphicTBI
}
// Disable intent interleaving if requested.
iterKind := MVCCKeyAndIntentsIterKind
if opts.IntentPolicy == MVCCIncrementalIterIntentPolicyIgnore {
iterKind = MVCCKeyIterKind
}
var iter MVCCIterator
var err error
var timeBoundIter MVCCIterator
if useTBI {
// An iterator without the timestamp hints is created to ensure that the
// iterator visits every required version of every key that has changed.
iter, err = reader.NewMVCCIterator(ctx, iterKind, IterOptions{
KeyTypes: opts.KeyTypes,
LowerBound: opts.StartKey,
UpperBound: opts.EndKey,
RangeKeyMaskingBelow: opts.RangeKeyMaskingBelow,
ReadCategory: opts.ReadCategory,
})
if err != nil {
return nil, err
}
// The timeBoundIter is only required to see versioned keys, since the
// intents will be found by iter. It can also always enable range key
// masking at the start time, since we never care about point keys below it
// (the same isn't true for the main iterator, since it would break
// NextIgnoringTime).
tbiRangeKeyMasking := opts.RangeKeyMaskingBelow
if tbiRangeKeyMasking.LessEq(opts.StartTime) && opts.KeyTypes == IterKeyTypePointsAndRanges {
tbiRangeKeyMasking = opts.StartTime.Next()
}
timeBoundIter, err = reader.NewMVCCIterator(ctx, MVCCKeyIterKind, IterOptions{
KeyTypes: opts.KeyTypes,
LowerBound: opts.StartKey,
UpperBound: opts.EndKey,
// The call to startTime.Next() converts our exclusive start bound into
// the inclusive start bound that MinTimestamp expects.
MinTimestamp: opts.StartTime.Next(),
MaxTimestamp: opts.EndTime,
RangeKeyMaskingBelow: tbiRangeKeyMasking,
ReadCategory: opts.ReadCategory,
})
if err != nil {
iter.Close()
return nil, err
}
} else {
iter, err = reader.NewMVCCIterator(ctx, iterKind, IterOptions{
KeyTypes: opts.KeyTypes,
LowerBound: opts.StartKey,
UpperBound: opts.EndKey,
RangeKeyMaskingBelow: opts.RangeKeyMaskingBelow,
ReadCategory: opts.ReadCategory,
})
if err != nil {
return nil, err
}
}
return &MVCCIncrementalIterator{
iter: iter,
startTime: opts.StartTime,
endTime: opts.EndTime,
timeBoundIter: timeBoundIter,
intentPolicy: opts.IntentPolicy,
maxLockConflicts: opts.MaxLockConflicts,
targetLockConflictBytes: opts.TargetLockConflictBytes,
}, nil
}
// SeekGE implements SimpleMVCCIterator.
func (i *MVCCIncrementalIterator) SeekGE(startKey MVCCKey) {
if i.timeBoundIter != nil {
// Check which is the first key seen by the TBI.
i.timeBoundIter.SeekGE(startKey)
if ok, err := i.timeBoundIter.Valid(); !ok {
i.err = err
i.valid = false
return
}
unsafeTBIKey := i.timeBoundIter.UnsafeKey().Key
if unsafeTBIKey.Compare(startKey.Key) > 0 {
// If the first key that the TBI sees is ahead of the given startKey, we
// can seek directly to the first version of the key.
startKey = MakeMVCCMetadataKey(unsafeTBIKey.Clone())
}
}
prevRangeKey := i.rangeKeys.Bounds.Key.Clone()
i.iter.SeekGE(startKey)
i.advance(true /* seeked */)
i.rangeKeyChanged = !prevRangeKey.Equal(i.rangeKeys.Bounds.Key) // Is there a better way?
i.rangeKeyChangedIgnoringTime = i.rangeKeyChanged
}
// Close implements SimpleMVCCIterator.
func (i *MVCCIncrementalIterator) Close() {
i.iter.Close()
if i.timeBoundIter != nil {
i.timeBoundIter.Close()
}
}
// Next implements SimpleMVCCIterator.
func (i *MVCCIncrementalIterator) Next() {
i.iter.Next()
i.advance(false /* seeked */)
}
// updateValid updates i.valid and i.err based on the underlying iterator, and
// returns true if valid.
// gcassert:inline
func (i *MVCCIncrementalIterator) updateValid() bool {
i.valid, i.err = i.iter.Valid()
return i.valid
}
// NextKey implements SimpleMVCCIterator.
func (i *MVCCIncrementalIterator) NextKey() {
i.iter.NextKey()
i.advance(false /* seeked */)
}
// maybeSkipKeys checks if any keys can be skipped by using a time-bound
// iterator. If keys can be skipped, it will update the main iterator to point
// to the earliest version of the next candidate key. It is expected (but not
// required) that TBI is at a key <= main iterator key when calling
// maybeSkipKeys().
//
// Returns true if any of the iter positioning operations caused the range keys
// to change.
//
// NB: This logic will not handle TBI range key filtering properly -- the TBI
// may see different range key fragmentation than the regular iterator, causing
// it to skip past range key fragments. Range key filtering has therefore been
// disabled in pebbleMVCCIterator, since the performance gains are expected to
// be marginal, and the necessary seeks/processing here would likely negate it.
// See: https://github.com/cockroachdb/cockroach/issues/86260
func (i *MVCCIncrementalIterator) maybeSkipKeys() (rangeKeyChanged bool) {
if i.timeBoundIter == nil {
// If there is no time bound iterator, we cannot skip any keys.
return false
}
tbiKey := i.timeBoundIter.UnsafeKey().Key
iterKey := i.iter.UnsafeKey().Key
if iterKey.Compare(tbiKey) > 0 {
// If the iterKey got ahead of the TBI key, advance the TBI Key.
//
// We fast-path the case where the main iterator is referencing the next
// key that would be visited by the TBI. In that case, after the following
// NextKey call, we will have iterKey == tbiKey. This means that for the
// incremental iterator to perform a Next or NextKey will require only 1
// extra NextKey invocation while they remain in lockstep. This case will
// be common if most keys are modified, or the modifications are clustered
// in keyspace, which makes the incremental iterator optimization
// ineffective. And so in this case we want to minimize the extra cost of
// using the incremental iterator, by avoiding a SeekGE.
i.timeBoundIter.NextKey()
if ok, err := i.timeBoundIter.Valid(); !ok {
i.valid, i.err = false, err
return false
}
tbiKey = i.timeBoundIter.UnsafeKey().Key
cmp := iterKey.Compare(tbiKey)
if cmp > 0 {
// If the tbiKey is still behind the iterKey, the TBI key may be seeing
// phantom MVCCKey.Keys. These keys may not be seen by the main iterator
// due to aborted transactions and keys which have been subsumed due to
// range tombstones. In this case we can SeekGE() the TBI to the main iterator.
seekKey := MakeMVCCMetadataKey(iterKey)
i.timeBoundIter.SeekGE(seekKey)
if ok, err := i.timeBoundIter.Valid(); !ok {
i.valid, i.err = false, err
return false
}
tbiKey = i.timeBoundIter.UnsafeKey().Key
// If there is an MVCC range key across iterKey, then the TBI seek may get
// stuck in the middle of the bare range key so we step forward.
if hasPoint, hasRange := i.timeBoundIter.HasPointAndRange(); hasRange && !hasPoint {
if !i.timeBoundIter.RangeBounds().Key.Equal(tbiKey) {
i.timeBoundIter.Next()
if ok, err := i.timeBoundIter.Valid(); !ok {
i.valid, i.err = false, err
return false
}
tbiKey = i.timeBoundIter.UnsafeKey().Key
}
}
cmp = iterKey.Compare(tbiKey)
}
if cmp < 0 {
// In the case that the next MVCC key that the TBI observes is not the
// same as the main iterator, we may be able to skip over a large group
// of keys. The main iterator is seeked to the TBI in hopes that many
// keys were skipped. Note that a Seek is an order of magnitude more
// expensive than a Next call, but the engine has low-level
// optimizations that attempt to make it cheaper if the seeked key is
// "nearby" (within the same sstable block).
seekKey := MakeMVCCMetadataKey(tbiKey)
i.iter.SeekGE(seekKey)
if !i.updateValid() {
return false
}
rangeKeyChanged := i.iter.RangeKeyChanged()
// The seek may have landed in the middle of a bare range key, in which
// case we should move on to the next key.
if hasPoint, hasRange := i.iter.HasPointAndRange(); hasRange && !hasPoint {
if !i.iter.RangeBounds().Key.Equal(i.iter.UnsafeKey().Key) {
i.iter.Next()
if !i.updateValid() {
return false
}
rangeKeyChanged = rangeKeyChanged || i.iter.RangeKeyChanged()
}
}
return rangeKeyChanged
}
}
return false
}
// updateMeta initializes i.meta. It sets i.err and returns an error on any
// errors, e.g. if it encounters an intent in the time span (startTime, endTime]
// or an inline value.
func (i *MVCCIncrementalIterator) updateMeta() error {
unsafeKey := i.iter.UnsafeKey()
if unsafeKey.IsValue() {
// The key is an MVCC value and not an intent or inline.
i.meta.Reset()
i.meta.Timestamp = unsafeKey.Timestamp.ToLegacyTimestamp()
return nil
}
// The key is a metakey (an intent or inline meta). If it is an inline meta,
// it is handled below. If it is an intent meta, it is used later to check
// whether the intent's timestamp falls within the incremental iterator's
// time bounds.
var v []byte
v, i.err = i.iter.UnsafeValue()
if i.err != nil {
i.valid = false
return i.err
}
if i.err = protoutil.Unmarshal(v, &i.meta); i.err != nil {
i.valid = false
return i.err
}
if i.meta.IsInline() {
i.valid = false
i.err = errors.Errorf("unexpected inline value found: %s", unsafeKey.Key)
return i.err
}
if i.meta.Txn == nil {
i.valid = false
i.err = errors.Errorf("intent is missing a txn: %s", unsafeKey.Key)
return i.err
}
metaTimestamp := i.meta.Timestamp.ToTimestamp()
if i.startTime.Less(metaTimestamp) && metaTimestamp.LessEq(i.endTime) {
switch i.intentPolicy {
case MVCCIncrementalIterIntentPolicyError:
i.err = &kvpb.LockConflictError{
Locks: []roachpb.Lock{
roachpb.MakeIntent(i.meta.Txn, i.iter.UnsafeKey().Key.Clone()).AsLock(),
},
}
i.valid = false
return i.err
case MVCCIncrementalIterIntentPolicyAggregate:
// We are collecting intents, so we need to save this one and advance to its
// provisional value. The caller can then use the value key to update row
// counters for bookkeeping and advance further.
intent := roachpb.MakeIntent(i.meta.Txn, i.iter.UnsafeKey().Key.Clone())
i.intents = append(i.intents, intent)
i.collectedIntentBytes += uint64(intent.Size())
if i.targetLockConflictBytes > 0 && i.collectedIntentBytes >= i.targetLockConflictBytes {
i.valid = false
i.err = i.TryGetIntentError()
return i.err
}
if i.maxLockConflicts > 0 && uint64(len(i.intents)) >= i.maxLockConflicts {
i.valid = false
i.err = i.TryGetIntentError()
return i.err
}
return nil
case MVCCIncrementalIterIntentPolicyEmit:
// We will emit this intent to the caller.
return nil
case MVCCIncrementalIterIntentPolicyIgnore:
// We don't expect to see this since we disabled intent interleaving.
i.err = errors.AssertionFailedf("unexpected intent (interleaving disabled): %s", &i.meta)
i.valid = false
return i.err
default:
i.err = errors.AssertionFailedf("unknown intent policy: %d", i.intentPolicy)
i.valid = false
return i.err
}
}
return nil
}
// updateRangeKeys updates the iterator with the current range keys, filtered by
// time span, and returns whether the position has point and/or range keys.
func (i *MVCCIncrementalIterator) updateRangeKeys() (bool, bool) {
hasPoint, hasRange := i.iter.HasPointAndRange()
if hasRange {
// Clone full set of range keys into i.rangeKeysIgnoringTime.
rangeKeys := i.iter.RangeKeys()
rangeKeys.CloneInto(&i.rangeKeysIgnoringTime)
// Keep trimmed subset in i.rangeKeys.
i.rangeKeys = i.rangeKeysIgnoringTime
i.rangeKeys.Trim(i.startTime.Next(), i.endTime)
if i.rangeKeys.IsEmpty() {
i.rangeKeys.Clear()
hasRange = false
}
} else {
i.rangeKeys.Clear()
i.rangeKeysIgnoringTime.Clear()
}
return hasPoint, hasRange
}
// advance advances the main iterator until it is referencing a key within
// (start_time, end_time]. If seeked is true, the caller is a SeekGE operation,
// in which case we should emit the current range key position even if
// RangeKeyChanged() doesn't trigger.
//
// It populates i.err with an error if it encountered an inline value or an
// intent with a timestamp within the incremental iterator's bounds when the
// intent policy is MVCCIncrementalIterIntentPolicyError.
func (i *MVCCIncrementalIterator) advance(seeked bool) {
i.ignoringTime = false
i.rangeKeyChanged, i.rangeKeyChangedIgnoringTime = false, false
hadRange, hadRangeIgnoringTime := !i.rangeKeys.IsEmpty(), !i.rangeKeysIgnoringTime.IsEmpty()
for {
if !i.updateValid() {
return
}
// If the caller was a SeekGE operation, process the initial range key (if
// any) even if RangeKeyChanged() does not fire.
rangeKeyChanged := seeked || i.iter.RangeKeyChanged()
seeked = false
if i.maybeSkipKeys() {
rangeKeyChanged = true
}
if !i.valid {
return
}
// Process range keys.
var newRangeKey bool
if rangeKeyChanged {
i.hasPoint, i.hasRange = i.updateRangeKeys()
newRangeKey = i.hasRange
// NB: !hasRange → !hasRange is not a change.
i.rangeKeyChanged = hadRange || i.hasRange
i.rangeKeyChangedIgnoringTime = hadRangeIgnoringTime || !i.rangeKeysIgnoringTime.IsEmpty()
// If we're on a visible, bare range key then we're done. If the range key
// was filtered out by the time bounds (the !hasPoint && !hasRange case),
// then we move on to the next key.
if !i.hasPoint {
if !i.hasRange {
i.iter.Next()
continue
}
i.meta.Reset()
return
}
} else if !i.hasPoint {
// If the range key didn't change, and this wasn't a seek, then we must be
// on a point key since the iterator won't surface anything else.
i.hasPoint = true
}
// Process point keys.
if err := i.updateMeta(); err != nil {
return
}
// INVARIANT: we have an intent or an MVCC value.
if i.meta.Txn != nil {
switch i.intentPolicy {
case MVCCIncrementalIterIntentPolicyEmit:
// If our policy is emit, we may want this
// intent. If it is outside our time bounds, it
// will be filtered below.
case MVCCIncrementalIterIntentPolicyError,
MVCCIncrementalIterIntentPolicyAggregate,
MVCCIncrementalIterIntentPolicyIgnore:
// We have encountered an intent but it must lie outside the timestamp
// span (startTime, endTime], or have been aggregated or ignored. In
// any of these cases, we want to advance past it, unless we're also on
// a new range key that must be emitted.
if newRangeKey {
i.hasPoint = false
return
}
i.iter.Next()
continue
}
}
// Note that MVCC keys are sorted by key, then by _descending_ timestamp
// order with the exception of the metakey (timestamp 0) being sorted
// first.
//
// If we encountered a new range key on this position, then we must emit it
// even if the point key should be skipped. This typically happens on a
// filtered intent or when seeking directly to a filtered point version.
metaTimestamp := i.meta.Timestamp.ToTimestamp()
if newRangeKey {
i.hasPoint = i.startTime.Less(metaTimestamp) && metaTimestamp.LessEq(i.endTime)
return
} else if i.endTime.Less(metaTimestamp) {
i.iter.Next()
} else if metaTimestamp.LessEq(i.startTime) {
i.iter.NextKey()
} else {
// The current key is a valid user key and within the time bounds. We are
// done.
break
}
}
}
// Valid implements SimpleMVCCIterator.
func (i *MVCCIncrementalIterator) Valid() (bool, error) {
if util.RaceEnabled && i.valid {
if err := i.assertInvariants(); err != nil {
return false, err
}
}
return i.valid, i.err
}
// UnsafeKey implements SimpleMVCCIterator.
func (i *MVCCIncrementalIterator) UnsafeKey() MVCCKey {
return i.iter.UnsafeKey()
}
// HasPointAndRange implements SimpleMVCCIterator.
//
// This only returns hasRange=true if there are filtered range keys present.
// Thus, it is possible for this to return hasPoint=false,hasRange=false
// following a NextIgnoringTime() call if positioned on a bare, filtered
// range key. In this case, the range keys are available via
// RangeKeysIgnoringTime().
func (i *MVCCIncrementalIterator) HasPointAndRange() (bool, bool) {
return i.hasPoint, i.hasRange
}
// RangeBounds implements SimpleMVCCIterator.
//
// This only returns the filtered range key bounds. Thus, if a
// NextIgnoringTime() call moves onto an otherwise hidden range key, this will
// still return an empty span. These hidden range keys are available via
// RangeKeysIgnoringTime().
func (i *MVCCIncrementalIterator) RangeBounds() roachpb.Span {
return i.rangeKeys.Bounds
}
// RangeKeys implements SimpleMVCCIterator.
func (i *MVCCIncrementalIterator) RangeKeys() MVCCRangeKeyStack {
return i.rangeKeys
}
// RangeKeysIgnoringTime returns the range keys at the current position,
// ignoring time bounds. This call is cheap, so callers do not need to perform
// their own caching.
func (i *MVCCIncrementalIterator) RangeKeysIgnoringTime() MVCCRangeKeyStack {
return i.rangeKeysIgnoringTime
}
// RangeKeyChanged implements SimpleMVCCIterator.
//
// RangeKeyChanged only applies to the filtered set of range keys. If an
// IgnoringTime() operation reveals additional range keys or versions, these do
// not trigger RangeKeyChanged(). See also RangeKeyChangedIgnoringTime().
func (i *MVCCIncrementalIterator) RangeKeyChanged() bool {
return i.rangeKeyChanged
}
// RangeKeyChangedIgnoringTime is like RangeKeyChanged, but returns true if the
// range keys returned by RangeKeysIgnoringTime() changed since the previous
// positioning operation -- in particular, after a Next(Key)IgnoringTime() call.
func (i *MVCCIncrementalIterator) RangeKeyChangedIgnoringTime() bool {
return i.rangeKeyChangedIgnoringTime
}
// UnsafeValue implements SimpleMVCCIterator.
func (i *MVCCIncrementalIterator) UnsafeValue() ([]byte, error) {
if !i.hasPoint {
return nil, nil
}
return i.iter.UnsafeValue()
}
// MVCCValueLenAndIsTombstone implements the SimpleMVCCIterator interface.
func (i *MVCCIncrementalIterator) MVCCValueLenAndIsTombstone() (int, bool, error) {
return i.iter.MVCCValueLenAndIsTombstone()
}
// ValueLen implements the SimpleMVCCIterator interface.
func (i *MVCCIncrementalIterator) ValueLen() int {
return i.iter.ValueLen()
}
// updateIgnoreTime updates the iterator's metadata and handles intents depending on the iterator's
// intent policy.
func (i *MVCCIncrementalIterator) updateIgnoreTime() {
i.ignoringTime = true
i.rangeKeyChanged, i.rangeKeyChangedIgnoringTime = false, false
hadRange := !i.rangeKeys.IsEmpty()
for {
if !i.updateValid() {
return
}
if i.iter.RangeKeyChanged() {
i.hasPoint, i.hasRange = i.updateRangeKeys()
i.rangeKeyChanged = hadRange || i.hasRange // !hasRange → !hasRange is no change
i.rangeKeyChangedIgnoringTime = true
if !i.hasPoint {
i.meta.Reset()
return
}
} else if !i.hasPoint {
i.hasPoint = true
}
if err := i.updateMeta(); err != nil {
return
}
// We have encountered an intent but it does not lie in the timestamp span
// (startTime, endTime] so we do not throw an error, and attempt to move to
// the intent's corresponding provisional value.
//
// Note: it's important to surface the intent's provisional value as callers rely on observing
// any value -- provisional, or not -- to make decisions. MVCCClearTimeRange, for example,
// flushes keys for deletion whenever it encounters a key outside (StartTime,EndTime].
//
// TODO(msbulter): investigate if it's clearer for the caller to emit the intent in
// addition to the provisional value.
if i.meta.Txn != nil && i.intentPolicy != MVCCIncrementalIterIntentPolicyEmit {
i.iter.Next()
continue
}
// We have a valid KV or an intent to emit.
return
}
}
// NextIgnoringTime returns the next key/value that would be encountered in a
// non-incremental iteration by moving the underlying non-TBI iterator forward.
// Intents within and outside the (StartTime, EndTime] time range are handled
// according to the iterator policy.
//
// NB: Range key methods only respect the filtered set of range keys. To access
// unfiltered range keys, use RangeKeysIgnoringTime(). This implies that if this
// call steps onto a range key that's entirely outside of the time bounds:
//
//   - HasPointAndRange() will return false,false if on a bare range key.
//
//   - RangeKeyChanged() will not fire, unless stepping off of a range key
//     within the time bounds.
//
//   - RangeBounds() and RangeKeys() will return empty results.
func (i *MVCCIncrementalIterator) NextIgnoringTime() {
i.iter.Next()
i.updateIgnoreTime()
}
// NextKeyIgnoringTime returns the next distinct key that would be encountered
// in a non-incremental iteration by moving the underlying non-TBI iterator
// forward. Intents within and outside the (StartTime, EndTime] time range are
// handled according to the iterator policy.
//
// NB: See NextIgnoringTime comment for important details about range keys.
func (i *MVCCIncrementalIterator) NextKeyIgnoringTime() {
i.iter.NextKey()
i.updateIgnoreTime()
}
// IgnoringTime returns true if the previous positioning operation ignored time
// bounds.
func (i *MVCCIncrementalIterator) IgnoringTime() bool {
return i.ignoringTime
}
// TryGetIntentError returns a kvpb.LockConflictError if intents were encountered
// during iteration and intent aggregation is enabled; otherwise it returns nil.
// The kvpb.LockConflictError contains the encountered intents, bounded by the
// maxLockConflicts and targetLockConflictBytes constraints.
// TODO(nvanbenschoten): rename to TryGetLockConflictError.
func (i *MVCCIncrementalIterator) TryGetIntentError() error {
if len(i.intents) == 0 {
return nil
}
return &kvpb.LockConflictError{
Locks: roachpb.AsLocks(i.intents),
}
}
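// Illustrative sketch of the aggregate intent policy (hypothetical reader,
// bounds, and limits): iterate the span, then surface a single
// LockConflictError for any intents collected along the way, even if the
// limits were never reached.
//
//	iter, err := NewMVCCIncrementalIterator(ctx, reader, MVCCIncrementalIterOptions{
//	    StartKey:         startKey,
//	    EndKey:           endKey,
//	    StartTime:        startTime,
//	    EndTime:          endTime,
//	    IntentPolicy:     MVCCIncrementalIterIntentPolicyAggregate,
//	    MaxLockConflicts: 100,
//	})
//	if err != nil {
//	    return err
//	}
//	defer iter.Close()
//	for iter.SeekGE(MakeMVCCMetadataKey(startKey)); ; iter.Next() {
//	    if ok, err := iter.Valid(); err != nil {
//	        return err
//	    } else if !ok {
//	        break
//	    }
//	    // ... consume iter.UnsafeKey() and iter.UnsafeValue() ...
//	}
//	if err := iter.TryGetIntentError(); err != nil {
//	    return err
//	}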
// Stats returns statistics about the iterator.
func (i *MVCCIncrementalIterator) Stats() IteratorStats {
stats := i.iter.Stats()
if i.timeBoundIter != nil {
tbStats := i.timeBoundIter.Stats()
stats.Stats.Merge(tbStats.Stats)
}
return stats
}
// assertInvariants asserts iterator invariants. The iterator must be valid.
func (i *MVCCIncrementalIterator) assertInvariants() error {
// Check general SimpleMVCCIterator API invariants.
if err := assertSimpleMVCCIteratorInvariants(i); err != nil {
return err
}
// The underlying iterator must be valid when the MVCCIncrementalIterator is.
if ok, err := i.iter.Valid(); err != nil || !ok {
return errors.AssertionFailedf("i.iter is invalid with err=%v", err)
}
iterKey := i.iter.UnsafeKey()
// endTime must be set, and be at or after startTime.
if i.endTime.IsEmpty() {
return errors.AssertionFailedf("i.endTime not set")
}
if i.endTime.Less(i.startTime) {
return errors.AssertionFailedf("i.endTime %s before i.startTime %s", i.endTime, i.startTime)
}
// If startTime is empty, the TBI should be disabled in non-metamorphic builds.
if !metamorphic.IsMetamorphicBuild() && i.startTime.IsEmpty() && i.timeBoundIter != nil {
return errors.AssertionFailedf("TBI enabled without i.startTime")
}
// If the TBI is enabled, its position should be <= iter unless iter is on an intent.
if i.timeBoundIter != nil && iterKey.Timestamp.IsSet() {
if ok, _ := i.timeBoundIter.Valid(); ok {
if tbiKey := i.timeBoundIter.UnsafeKey(); tbiKey.Compare(iterKey) > 0 {
return errors.AssertionFailedf("TBI at %q ahead of i.iter at %q", tbiKey, iterKey)
}
}
}
// i.meta should match the underlying iterator's key.
if hasPoint, _ := i.iter.HasPointAndRange(); hasPoint {
metaTS := i.meta.Timestamp.ToTimestamp()
if iterKey.Timestamp.IsSet() && metaTS != iterKey.Timestamp {
return errors.AssertionFailedf("i.meta.Timestamp %s differs from i.iter.UnsafeKey %s",
metaTS, iterKey)
}
if metaTS.IsEmpty() && i.meta.Txn == nil {
return errors.AssertionFailedf("empty i.meta for point key %s", iterKey)
}
} else {
if i.meta.Timestamp.ToTimestamp().IsSet() || i.meta.Txn != nil {
return errors.AssertionFailedf("i.iter hasPoint=false but non-empty i.meta %+v", i.meta)
}
}
// Unlike most SimpleMVCCIterators, it's possible to return
// hasPoint=false,hasRange=false following a NextIgnoringTime() call.
hasPoint, hasRange := i.HasPointAndRange()
if !hasPoint && !hasRange {
if !i.ignoringTime {
return errors.AssertionFailedf(
"hasPoint=false,hasRange=false invalid when i.ignoringTime=false")
}
if i.RangeKeysIgnoringTime().IsEmpty() {
return errors.AssertionFailedf(
"hasPoint=false,hasRange=false and RangeKeysIgnoringTime() returned nothing")
}
}
// Point keys and range keys must be within the time bounds, unless
// we're ignoring time bounds.
assertInRange := func(ts hlc.Timestamp, format string, args ...interface{}) error {
if i.startTime.IsSet() && ts.LessEq(i.startTime) || i.endTime.Less(ts) {
return errors.AssertionFailedf("%s not in range (%s-%s]",
fmt.Sprintf(format, args...), i.startTime, i.endTime)
}
return nil
}
key := i.UnsafeKey()
if hasPoint && !i.ignoringTime {
if key.Timestamp.IsEmpty() {
intent := key.Clone()
intent.Timestamp = i.meta.Timestamp.ToTimestamp()
if err := assertInRange(intent.Timestamp, "intent %s", intent); err != nil {
return err
}
} else {
if err := assertInRange(key.Timestamp, "point key %s", key); err != nil {
return err
}
}
}
if hasRange {
rangeKeys := i.RangeKeys()
for _, v := range rangeKeys.Versions {
if err := assertInRange(v.Timestamp, "range key %s", rangeKeys.AsRangeKey(v)); err != nil {
return err
}
}
}
// Check that intents are processed according to intentPolicy.
if hasPoint && key.Timestamp.IsEmpty() && i.intentPolicy != MVCCIncrementalIterIntentPolicyEmit {
return errors.AssertionFailedf("emitted intent %s not allowed by i.intentPolicy %v",
key, i.intentPolicy)
}
if len(i.intents) > 0 && i.intentPolicy != MVCCIncrementalIterIntentPolicyAggregate {
return errors.AssertionFailedf("i.intents set but not allowed by i.intentPolicy %v",
i.intentPolicy)
}
for _, intent := range i.intents {
intentKey := MVCCKey{Key: intent.Key, Timestamp: intent.Txn.WriteTimestamp}
if err := assertInRange(intentKey.Timestamp, "gathered intent %s", intentKey); err != nil {
return err
}
}
// RangeKeys() must be a subset of RangeKeysIgnoringTime().
if hasRange {
rangeKeys := i.RangeKeys()
rangeKeysIgnoringTime := i.RangeKeysIgnoringTime()
if !rangeKeys.Bounds.Equal(rangeKeysIgnoringTime.Bounds) {
return errors.AssertionFailedf("RangeKeys=%s does not match RangeKeysIgnoringTime=%s",
rangeKeys.Bounds, rangeKeysIgnoringTime.Bounds)
}
trimmedVersions := rangeKeysIgnoringTime.Versions
trimmedVersions.Trim(rangeKeys.Oldest(), rangeKeys.Newest())
if !rangeKeys.Versions.Equal(trimmedVersions) {
return errors.AssertionFailedf("RangeKeys=%s not subset of RangeKeysIgnoringTime=%s",
rangeKeys, rangeKeysIgnoringTime)
}
} else {
// RangeKeysIgnoringTime must cover the current iterator position.
if rangeKeys := i.RangeKeysIgnoringTime(); !rangeKeys.IsEmpty() {
if !rangeKeys.Bounds.ContainsKey(key.Key) {
return errors.AssertionFailedf("RangeKeysIgnoringTime %s does not cover position %s",
rangeKeys.Bounds, key)
}
}
}
return nil
}
// Copyright 2022 The Cockroach Authors.
//
// Use of this software is governed by the CockroachDB Software License
// included in the /LICENSE file.
package storage
import (
"bytes"
"encoding/binary"
"fmt"
"slices"
"sort"
"strings"
"github.com/cockroachdb/cockroach/pkg/roachpb"
"github.com/cockroachdb/cockroach/pkg/storage/enginepb"
"github.com/cockroachdb/cockroach/pkg/util/hlc"
"github.com/cockroachdb/errors"
)
var (
// MVCCKeyMax sorts after all other MVCC keys.
MVCCKeyMax = MakeMVCCMetadataKey(roachpb.KeyMax)
// NilKey is the nil MVCCKey.
NilKey = MVCCKey{}
)
const (
mvccEncodedTimeSentinelLen = 1
mvccEncodedTimeWallLen = 8
mvccEncodedTimeLogicalLen = 4
mvccEncodedTimeSyntheticLen = 1
mvccEncodedTimeLengthLen = 1
)
// MVCCKey is a versioned key, distinguished from roachpb.Key with the addition
// of a "version" timestamp.
//
// The version timestamp dictates the key's visibility to readers. Readers with
// read timestamps equal to or greater than the version timestamp observe the
// key. Readers with read timestamps below the version timestamp ignore the key.
// Keys are stored in decreasing version order, with the exception of version
// zero (timestamp 0), which is referred to as a "meta" version and is stored
// before all other versions of the same key.
type MVCCKey struct {
Key roachpb.Key
Timestamp hlc.Timestamp
}
// MakeMVCCMetadataKey creates an MVCCKey from a roachpb.Key.
func MakeMVCCMetadataKey(key roachpb.Key) MVCCKey {
return MVCCKey{Key: key}
}
// Next returns the next key.
func (k MVCCKey) Next() MVCCKey {
ts := k.Timestamp.Prev()
if ts.IsEmpty() {
return MVCCKey{
Key: k.Key.Next(),
}
}
return MVCCKey{
Key: k.Key,
Timestamp: ts,
}
}
// Clone returns a copy of the key.
func (k MVCCKey) Clone() MVCCKey {
k.Key = k.Key.Clone()
return k
}
// CloneInto copies the key into the provided destination MVCCKey, reusing and
// overwriting its key slice.
func (k MVCCKey) CloneInto(dst *MVCCKey) {
dst.Key = append(dst.Key[:0], k.Key...)
dst.Timestamp = k.Timestamp
}
// Compare returns -1 if this key is less than the given key, 0 if they're
// equal, or 1 if this is greater. Comparison is by key,timestamp, where larger
// timestamps sort before smaller ones except empty ones which sort first (like
// elsewhere in MVCC).
func (k MVCCKey) Compare(o MVCCKey) int {
if c := k.Key.Compare(o.Key); c != 0 {
return c
}
if k.Timestamp.IsEmpty() && !o.Timestamp.IsEmpty() {
return -1
} else if !k.Timestamp.IsEmpty() && o.Timestamp.IsEmpty() {
return 1
} else {
return -k.Timestamp.Compare(o.Timestamp) // timestamps sort in reverse
}
}
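// For illustration, MVCC key ordering for a given user key places the empty
// (meta) timestamp first, followed by versions in descending timestamp order
// (hypothetical keys and timestamps):
//
//	a        (meta)
//	a@7
//	a@3
//	b@5
//
// e.g. MVCCKey{Key: a, Timestamp: ts7}.Compare(MVCCKey{Key: a, Timestamp: ts3}) == -1,
// and MakeMVCCMetadataKey(a).Compare(MVCCKey{Key: a, Timestamp: ts7}) == -1.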
// Less compares two keys.
func (k MVCCKey) Less(l MVCCKey) bool {
if c := k.Key.Compare(l.Key); c != 0 {
return c < 0
}
if !k.IsValue() {
return l.IsValue()
} else if !l.IsValue() {
return false
}
return l.Timestamp.Less(k.Timestamp)
}
// Equal returns whether two keys are identical.
func (k MVCCKey) Equal(l MVCCKey) bool {
return k.Key.Compare(l.Key) == 0 && k.Timestamp == l.Timestamp
}
// IsValue returns true iff the timestamp is non-zero.
func (k MVCCKey) IsValue() bool {
return !k.Timestamp.IsEmpty()
}
// EncodedSize returns the size of the MVCCKey when encoded.
//
// TODO(itsbilal): Reconcile this with Len(). Would require updating MVCC stats
// tests to reflect the more accurate lengths provided by Len().
// TODO(nvanbenschoten): Change the return value to an int64. That's what every
// caller wants.
func (k MVCCKey) EncodedSize() int {
n := len(k.Key) + 1
if k.IsValue() {
// Note that this isn't quite accurate: timestamps consume between 8-13
// bytes. Fixing this only adjusts the accounting for timestamps, not the
// actual on disk storage.
n += int(MVCCVersionTimestampSize)
}
return n
}
// String returns a string-formatted version of the key.
func (k MVCCKey) String() string {
if !k.IsValue() {
return k.Key.String()
}
return fmt.Sprintf("%s/%s", k.Key, k.Timestamp)
}
// Format implements the fmt.Formatter interface.
func (k MVCCKey) Format(f fmt.State, c rune) {
fmt.Fprintf(f, "%s/%s", k.Key, k.Timestamp)
}
// Len returns the size of the MVCCKey when encoded. Implements the
// pebble.Encodeable interface.
func (k MVCCKey) Len() int {
return encodedMVCCKeyLength(k)
}
// EncodeMVCCKey encodes an MVCCKey into its Pebble representation. The encoding
// takes the following forms, where trailing time components are omitted when
// zero-valued:
//
// [key] [sentinel] [timeWall] [timeLogical] [timeSynthetic] [timeLength]
// [key] [sentinel] [timeWall] [timeLogical] [timeLength]
// [key] [sentinel] [timeWall] [timeLength]
// [key] [sentinel]
//
// key: the unmodified binary key (variable length)
// sentinel: separates key and timestamp (1 byte: 0x00)
// timeWall: Timestamp.WallTime (8 bytes: big-endian uint64)
// timeLogical: Timestamp.Logical (4 bytes: big-endian uint32)
// timeSynthetic: Timestamp.Synthetic (1 byte: 0x01 when set)
// timeLength: encoded timestamp length inc. itself (1 byte: uint8)
//
// The sentinel byte can be used to detect a key without a timestamp, since
// timeLength will never be 0 (it includes itself in the length).
//
// The timeSynthetic form is no longer written by the current version of the
// code, but can be encountered in the wild until we migrate it away. Until
// then, decoding routines must be prepared to handle it, but can ignore the
// synthetic bit.
func EncodeMVCCKey(key MVCCKey) []byte {
keyLen := encodedMVCCKeyLength(key)
buf := make([]byte, keyLen)
encodeMVCCKeyToBuf(buf, key, keyLen)
return buf
}
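// A worked example (illustrative, not an authoritative test vector): encoding
// MVCCKey{Key: roachpb.Key("foo"), Timestamp: hlc.Timestamp{WallTime: 1, Logical: 2}}
// yields 3+1+8+4+1 = 17 bytes:
//
//	66 6f 6f                  key ("foo")
//	00                        sentinel
//	00 00 00 00 00 00 00 01   timeWall = 1 (big-endian uint64)
//	00 00 00 02               timeLogical = 2 (big-endian uint32)
//	0d                        timeLength = 13, including itself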
// EncodeMVCCKeyToBuf encodes an MVCCKey into its Pebble representation, reusing
// the given byte buffer if it has sufficient capacity.
func EncodeMVCCKeyToBuf(buf []byte, key MVCCKey) []byte {
keyLen := encodedMVCCKeyLength(key)
if cap(buf) < keyLen {
buf = make([]byte, keyLen)
} else {
buf = buf[:keyLen]
}
encodeMVCCKeyToBuf(buf, key, keyLen)
return buf
}
// EncodeMVCCKeyPrefix encodes an MVCC user key (without timestamp) into its
// Pebble prefix representation.
func EncodeMVCCKeyPrefix(key roachpb.Key) []byte {
return EncodeMVCCKey(MVCCKey{Key: key})
}
// encodeMVCCKeyToBuf encodes an MVCCKey into its Pebble representation to the
// target buffer, which must have the correct size.
func encodeMVCCKeyToBuf(buf []byte, key MVCCKey, keyLen int) {
copy(buf, key.Key)
pos := len(key.Key)
buf[pos] = 0 // sentinel byte
pos += mvccEncodedTimeSentinelLen
tsLen := keyLen - pos - mvccEncodedTimeLengthLen
if tsLen > 0 {
encodeMVCCTimestampToBuf(buf[pos:], key.Timestamp)
pos += tsLen
buf[pos] = byte(tsLen + mvccEncodedTimeLengthLen)
}
}
// encodeMVCCTimestamp encodes an MVCC timestamp into its Pebble
// representation, excluding length suffix and sentinel byte.
func encodeMVCCTimestamp(ts hlc.Timestamp) []byte {
tsLen := encodedMVCCTimestampLength(ts)
if tsLen == 0 {
return nil
}
buf := make([]byte, tsLen)
encodeMVCCTimestampToBuf(buf, ts)
return buf
}
// EncodeMVCCTimestampSuffix encodes an MVCC timestamp into its Pebble
// representation, including the length suffix but excluding the sentinel byte.
// This is equivalent to the Pebble suffix.
func EncodeMVCCTimestampSuffix(ts hlc.Timestamp) []byte {
return encodeMVCCTimestampSuffixToBuf(nil, ts)
}
// encodeMVCCTimestampSuffixToBuf encodes an MVCC timestamp into its Pebble
// representation, including the length suffix but excluding the sentinel byte.
// This is equivalent to the Pebble suffix. It reuses the given byte buffer if
// it has sufficient capacity.
func encodeMVCCTimestampSuffixToBuf(buf []byte, ts hlc.Timestamp) []byte {
tsLen := encodedMVCCTimestampLength(ts)
if tsLen == 0 {
return buf[:0]
}
suffixLen := tsLen + mvccEncodedTimeLengthLen
if cap(buf) < suffixLen {
buf = make([]byte, suffixLen)
} else {
buf = buf[:suffixLen]
}
encodeMVCCTimestampToBuf(buf, ts)
buf[tsLen] = byte(suffixLen)
return buf
}
// EncodeMVCCTimestampToBuf encodes an MVCC timestamp into its Pebble
// representation, excluding the length suffix and sentinel byte, reusing the
// given byte slice if it has sufficient capacity.
func EncodeMVCCTimestampToBuf(buf []byte, ts hlc.Timestamp) []byte {
tsLen := encodedMVCCTimestampLength(ts)
if tsLen == 0 {
return buf[:0]
}
if cap(buf) < tsLen {
buf = make([]byte, tsLen)
} else {
buf = buf[:tsLen]
}
encodeMVCCTimestampToBuf(buf, ts)
return buf
}
// encodeMVCCTimestampToBuf encodes an MVCC timestamp into its Pebble
// representation, excluding the length suffix and sentinel byte. The target
// buffer must have the correct size, and the timestamp must not be empty.
func encodeMVCCTimestampToBuf(buf []byte, ts hlc.Timestamp) {
binary.BigEndian.PutUint64(buf, uint64(ts.WallTime))
if ts.Logical != 0 {
binary.BigEndian.PutUint32(buf[mvccEncodedTimeWallLen:], uint32(ts.Logical))
}
}
// encodedMVCCKeyLength returns the encoded length of the given MVCCKey.
func encodedMVCCKeyLength(key MVCCKey) int {
// NB: We don't call into EncodedMVCCKeyPrefixLength() or
// EncodedMVCCTimestampSuffixLength() here because the additional function
// call overhead is significant.
keyLen := len(key.Key) + mvccEncodedTimeSentinelLen
if !key.Timestamp.IsEmpty() {
keyLen += mvccEncodedTimeWallLen + mvccEncodedTimeLengthLen
if key.Timestamp.Logical != 0 {
keyLen += mvccEncodedTimeLogicalLen
}
}
return keyLen
}
// EncodedMVCCKeyPrefixLength returns the encoded length of a roachpb.Key prefix
// including the sentinel byte.
func EncodedMVCCKeyPrefixLength(key roachpb.Key) int {
return len(key) + mvccEncodedTimeSentinelLen
}
// encodedMVCCTimestampLength returns the encoded length of the given MVCC
// timestamp, excluding the length suffix and sentinel bytes.
func encodedMVCCTimestampLength(ts hlc.Timestamp) int {
// This is backwards, but encodedMVCCKeyLength() is called in the
// EncodeMVCCKey() hot path and an additional function call to this function
// shows ~6% overhead in benchmarks. We therefore do the timestamp length
// calculation inline in encodedMVCCKeyLength(), and remove the excess here.
tsLen := encodedMVCCKeyLength(MVCCKey{Timestamp: ts}) - mvccEncodedTimeSentinelLen
if tsLen > 0 {
tsLen -= mvccEncodedTimeLengthLen
}
return tsLen
}
// EncodedMVCCTimestampSuffixLength returns the encoded length of the
// given MVCC timestamp, including the length suffix. It returns 0
// if the timestamp is empty.
func EncodedMVCCTimestampSuffixLength(ts hlc.Timestamp) int {
// This is backwards, see comment in encodedMVCCTimestampLength() for why.
return encodedMVCCKeyLength(MVCCKey{Timestamp: ts}) - mvccEncodedTimeSentinelLen
}
// TODO(erikgrinaker): merge in the enginepb decoding functions once it can
// avoid the storage package's problematic CGo dependency (via Pebble).
// DecodeMVCCKey decodes an MVCCKey from its Pebble representation.
func DecodeMVCCKey(encodedKey []byte) (MVCCKey, error) {
k, ts, err := enginepb.DecodeKey(encodedKey)
return MVCCKey{k, ts}, err
}
// decodeMVCCTimestamp decodes an MVCC timestamp from its Pebble representation,
// excluding the length suffix.
func decodeMVCCTimestamp(encodedTS []byte) (hlc.Timestamp, error) {
// NB: This logic is duplicated in enginepb.DecodeKey() to avoid the
// overhead of an additional function call there (~13%).
var ts hlc.Timestamp
switch len(encodedTS) {
case 0:
// No-op.
case 8:
ts.WallTime = int64(binary.BigEndian.Uint64(encodedTS[0:8]))
case 12, 13:
ts.WallTime = int64(binary.BigEndian.Uint64(encodedTS[0:8]))
ts.Logical = int32(binary.BigEndian.Uint32(encodedTS[8:12]))
// NOTE: byte 13 used to store the timestamp's synthetic bit, but this is no
// longer consulted and can be ignored during decoding.
default:
return hlc.Timestamp{}, errors.Errorf("bad timestamp %x", encodedTS)
}
return ts, nil
}
// DecodeMVCCTimestampSuffix decodes an MVCC timestamp from its Pebble representation,
// including the length suffix.
func DecodeMVCCTimestampSuffix(encodedTS []byte) (hlc.Timestamp, error) {
if len(encodedTS) == 0 {
return hlc.Timestamp{}, nil
}
encodedLen := len(encodedTS)
if suffixLen := int(encodedTS[encodedLen-1]); suffixLen != encodedLen {
return hlc.Timestamp{}, errors.Errorf(
"bad timestamp: found length suffix %d, actual length %d", suffixLen, encodedLen)
}
return decodeMVCCTimestamp(encodedTS[:encodedLen-1])
}
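// A minimal round-trip sketch: the suffix form is what Pebble stores after the
// user key, and decoding it recovers the original timestamp.
//
//	ts := hlc.Timestamp{WallTime: 1, Logical: 2}
//	suffix := EncodeMVCCTimestampSuffix(ts) // 13 bytes: wall, logical, length
//	decoded, err := DecodeMVCCTimestampSuffix(suffix)
//	// err == nil && decoded == ts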
// MVCCRangeKey is a versioned key span.
type MVCCRangeKey struct {
StartKey roachpb.Key
EndKey roachpb.Key
Timestamp hlc.Timestamp
// EncodedTimestampSuffix is an optional encoded representation of Timestamp
// as a Pebble "suffix". When reading range keys from the engine, the
// iterator copies the verbatim encoded timestamp here. There historically
// have been multiple representations of a timestamp that were intended to
// be logically equivalent. A bug in CockroachDB's pebble.Comparer
// implementation prevented some encodings from being considered equivalent.
// See #129592.
//
// To work around this wart within the comparer, we preserve a copy of the
// physical encoded timestamp we read off the engine. If a MVCCRangeKey with
// a non-empty EncodedTimestampSuffix is cleared via ClearMVCCRangeKey, the
// RangeKeyUnset tombstone is written with the verbatim
// EncodedTimestampSuffix.
EncodedTimestampSuffix []byte
}
// AsStack returns the range key as a range key stack with the given value.
func (k MVCCRangeKey) AsStack(valueRaw []byte) MVCCRangeKeyStack {
return MVCCRangeKeyStack{
Bounds: k.Bounds(),
Versions: MVCCRangeKeyVersions{{
Timestamp: k.Timestamp,
Value: valueRaw,
EncodedTimestampSuffix: k.EncodedTimestampSuffix,
}},
}
}
// Bounds returns the range key bounds as a Span.
func (k MVCCRangeKey) Bounds() roachpb.Span {
return roachpb.Span{Key: k.StartKey, EndKey: k.EndKey}
}
// Clone returns a copy of the range key.
func (k MVCCRangeKey) Clone() MVCCRangeKey {
// k is already a copy, but byte slices must be cloned.
k.StartKey = k.StartKey.Clone()
k.EndKey = k.EndKey.Clone()
k.EncodedTimestampSuffix = slices.Clone(k.EncodedTimestampSuffix)
return k
}
// Compare returns -1 if this key is less than the given key, 0 if they're
// equal, or 1 if this is greater. Comparison is by start key, timestamp, and
// end key, where larger timestamps sort before smaller ones, except that empty
// timestamps sort first (as elsewhere in MVCC).
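//
// For example (schematically, with keys a < b < c < d):
//
// [a,c)   < [a,c)@5   (empty timestamps sort first)
// [a,c)@5 < [a,c)@3   (larger timestamps sort before smaller ones)
// [a,c)@3 < [a,d)@3   (end key breaks the remaining tie)
// [a,d)@3 < [b,c)@9   (start key is compared first)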
func (k MVCCRangeKey) Compare(o MVCCRangeKey) int {
if c := k.StartKey.Compare(o.StartKey); c != 0 {
return c
}
if k.Timestamp.IsEmpty() && !o.Timestamp.IsEmpty() {
return -1
} else if !k.Timestamp.IsEmpty() && o.Timestamp.IsEmpty() {
return 1
} else if c := k.Timestamp.Compare(o.Timestamp); c != 0 {
return -c // timestamps sort in reverse
}
return k.EndKey.Compare(o.EndKey)
}
// EncodedSize returns the encoded size of this range key. This does not
// accurately reflect the on-disk size of the key, due to Pebble range key
// stacking and fragmentation.
//
// NB: This calculation differs from MVCCKey in that MVCCKey.EncodedSize()
// incorrectly always uses 13 bytes for the timestamp while this method
// calculates the actual encoded size.
func (k MVCCRangeKey) EncodedSize() int {
return EncodedMVCCKeyPrefixLength(k.StartKey) +
EncodedMVCCKeyPrefixLength(k.EndKey) +
EncodedMVCCTimestampSuffixLength(k.Timestamp)
}
// String formats the range key.
func (k MVCCRangeKey) String() string {
s := roachpb.Span{Key: k.StartKey, EndKey: k.EndKey}.String()
if !k.Timestamp.IsEmpty() {
s += fmt.Sprintf("/%s", k.Timestamp)
}
return s
}
// Validate returns an error if the range key is invalid.
//
// This validation is for writing range keys (or checking existing range keys),
// not for filters/bounds, so e.g. specifying an empty start key is invalid even
// though it would be valid to start a range key scan at an empty start key.
func (k MVCCRangeKey) Validate() (err error) {
defer func() {
err = errors.Wrapf(err, "invalid range key %s", k)
}()
switch {
case len(k.StartKey) == 0:
// We don't allow an empty start key, because we don't allow writing point
// keys at the empty key. The first valid key is 0x00.
return errors.Errorf("no start key")
case len(k.EndKey) == 0:
return errors.Errorf("no end key")
case k.Timestamp.IsEmpty():
return errors.Errorf("no timestamp")
case k.StartKey.Compare(k.EndKey) >= 0:
return errors.Errorf("start key %s is at or after end key %s", k.StartKey, k.EndKey)
default:
return nil
}
}
// Includes returns whether this MVCCRangeKey's bounds include the specified key.
func (k MVCCRangeKey) Includes(key roachpb.Key) bool {
return k.StartKey.Compare(key) <= 0 && k.EndKey.Compare(key) > 0
}
// Overlaps returns true if this MVCCRangeKey overlaps with the specified one.
func (k MVCCRangeKey) Overlaps(b MVCCRangeKey) bool {
return k.StartKey.Compare(b.EndKey) < 0 && k.EndKey.Compare(b.StartKey) > 0
}
// Deletes returns whether this MVCCRangeKey deletes the specified MVCC key.
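// For example, a range key over [a, c) at timestamp 5 deletes a@3 and b@5,
// but not c@3 (the end key is exclusive) or b@7 (newer than the range key).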
func (k MVCCRangeKey) Deletes(key MVCCKey) bool {
return k.Includes(key.Key) && !k.Timestamp.Less(key.Timestamp)
}
// MVCCRangeKeyStack represents a stack of range key fragments as returned
// by SimpleMVCCIterator.RangeKeys(). All fragments have the same key bounds,
// and are ordered from newest to oldest.
type MVCCRangeKeyStack struct {
Bounds roachpb.Span
Versions MVCCRangeKeyVersions
}
// MVCCRangeKeyVersions represents a stack of range key fragment versions.
type MVCCRangeKeyVersions []MVCCRangeKeyVersion
// MVCCRangeKeyVersion represents a single range key fragment version.
type MVCCRangeKeyVersion struct {
Timestamp hlc.Timestamp
Value []byte
// EncodedTimestampSuffix is an optional encoded representation of Timestamp
// as a Pebble "suffix". When reading range keys from the engine, the
// iterator copies the verbatim encoded timestamp here. There historically
// have been multiple representations of a timestamp that were intended to
// be logically equivalent. A bug in CockroachDB's pebble.Comparer
// implementation prevented some encodings from being considered equivalent.
// See #129592.
//
// To work around this wart within the comparer, we preserve a copy of the
// physical encoded timestamp we read off the engine. If an MVCCRangeKey with
// a non-empty EncodedTimestampSuffix is cleared via ClearMVCCRangeKey, the
// RangeKeyUnset tombstone is written with the verbatim
// EncodedTimestampSuffix.
EncodedTimestampSuffix []byte
}
// CloneInto copies the version into the provided destination
// MVCCRangeKeyVersion, reusing and overwriting its value slice.
func (v MVCCRangeKeyVersion) CloneInto(dst *MVCCRangeKeyVersion) {
dst.Timestamp = v.Timestamp
dst.Value = append(dst.Value[:0], v.Value...)
dst.EncodedTimestampSuffix = append(dst.EncodedTimestampSuffix[:0], v.EncodedTimestampSuffix...)
}
// AsRangeKey returns an MVCCRangeKey for the given version. Byte slices
// are shared with the stack.
func (s MVCCRangeKeyStack) AsRangeKey(v MVCCRangeKeyVersion) MVCCRangeKey {
return MVCCRangeKey{
StartKey: s.Bounds.Key,
EndKey: s.Bounds.EndKey,
Timestamp: v.Timestamp,
EncodedTimestampSuffix: v.EncodedTimestampSuffix,
}
}
// AsRangeKeys converts the stack into a slice of MVCCRangeKey. Byte slices
// are shared with the stack.
func (s MVCCRangeKeyStack) AsRangeKeys() []MVCCRangeKey {
rangeKeys := make([]MVCCRangeKey, 0, len(s.Versions))
for _, v := range s.Versions {
rangeKeys = append(rangeKeys, s.AsRangeKey(v))
}
return rangeKeys
}
// AsRangeKeyValue returns an MVCCRangeKeyValue for the given version. Byte
// slices are shared with the stack.
func (s MVCCRangeKeyStack) AsRangeKeyValue(v MVCCRangeKeyVersion) MVCCRangeKeyValue {
return MVCCRangeKeyValue{
RangeKey: s.AsRangeKey(v),
Value: v.Value,
}
}
// AsRangeKeyValues converts the stack into a slice of MVCCRangeKeyValue. Byte
// slices are shared with the stack.
func (s MVCCRangeKeyStack) AsRangeKeyValues() []MVCCRangeKeyValue {
kvs := make([]MVCCRangeKeyValue, 0, len(s.Versions))
for _, v := range s.Versions {
kvs = append(kvs, s.AsRangeKeyValue(v))
}
return kvs
}
// CanMergeRight returns true if the current stack will merge with the given
// right-hand stack. The key bounds must touch exactly, i.e. the left-hand
// EndKey must equal the right-hand Key.
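//
// For example, stacks over [a, b) and [b, c) with identical, non-empty version
// lists can merge into a single stack over [a, c); stacks over [a, b) and
// [c, d) cannot, since their bounds do not touch.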
func (s MVCCRangeKeyStack) CanMergeRight(r MVCCRangeKeyStack) bool {
if s.IsEmpty() || s.Len() != r.Len() || !s.Bounds.EndKey.Equal(r.Bounds.Key) {
return false
}
for i := range s.Versions {
if !s.Versions[i].Equal(r.Versions[i]) {
return false
}
}
return true
}
// Clear clears the stack but retains the byte slices. It is useful to
// empty out a stack being used as a CloneInto() target.
func (s *MVCCRangeKeyStack) Clear() {
s.Bounds.Key = s.Bounds.Key[:0]
s.Bounds.EndKey = s.Bounds.EndKey[:0]
s.Versions.Clear()
}
// Clone clones the stack.
func (s MVCCRangeKeyStack) Clone() MVCCRangeKeyStack {
s.Bounds = s.Bounds.Clone()
s.Versions = s.Versions.Clone()
return s
}
// CloneInto clones the stack into the given stack reference, reusing its byte
// and version slices where possible.
//
// TODO(erikgrinaker): Consider using a single allocation for all byte slices.
// However, we currently expect the majority of range keys to have no
// value, so we'll typically only make two allocations for the key bounds.
func (s MVCCRangeKeyStack) CloneInto(c *MVCCRangeKeyStack) {
c.Bounds.Key = append(c.Bounds.Key[:0], s.Bounds.Key...)
c.Bounds.EndKey = append(c.Bounds.EndKey[:0], s.Bounds.EndKey...)
s.Versions.CloneInto(&c.Versions)
}
// Covers returns true if any range key in the stack covers the given point key.
// A timestamp of 0 (i.e. an intent) is considered to be above all timestamps,
// and thus not covered by any range key.
func (s MVCCRangeKeyStack) Covers(k MVCCKey) bool {
return s.Versions.Covers(k.Timestamp) && s.Bounds.ContainsKey(k.Key)
}
// CoversTimestamp returns true if any range key in the stack covers the given timestamp.
func (s MVCCRangeKeyStack) CoversTimestamp(ts hlc.Timestamp) bool {
return s.Versions.Covers(ts)
}
// Equal returns true if the range key stacks are equal.
func (s MVCCRangeKeyStack) Equal(o MVCCRangeKeyStack) bool {
return s.Bounds.Equal(o.Bounds) && s.Versions.Equal(o.Versions)
}
// Excise removes the versions in the given [from, to] span (inclusive, in
// order) in place, returning true if any versions were removed.
func (s *MVCCRangeKeyStack) Excise(from, to hlc.Timestamp) bool {
return s.Versions.Excise(from, to)
}
// FirstAtOrAbove does a binary search for the first range key version at or
// above the given timestamp. Returns false if no matching range key was found.
func (s MVCCRangeKeyStack) FirstAtOrAbove(ts hlc.Timestamp) (MVCCRangeKeyVersion, bool) {
return s.Versions.FirstAtOrAbove(ts)
}
// FirstAtOrBelow does a binary search for the first range key version at or
// below the given timestamp. Returns false if no matching range key was found.
func (s MVCCRangeKeyStack) FirstAtOrBelow(ts hlc.Timestamp) (MVCCRangeKeyVersion, bool) {
return s.Versions.FirstAtOrBelow(ts)
}
// HasBetween checks whether an MVCC range key exists between the two given
// timestamps (both inclusive, in order).
func (s MVCCRangeKeyStack) HasBetween(lower, upper hlc.Timestamp) bool {
return s.Versions.HasBetween(lower, upper)
}
// IsEmpty returns true if the stack is empty (no versions).
func (s MVCCRangeKeyStack) IsEmpty() bool {
return s.Versions.IsEmpty()
}
// Len returns the number of versions in the stack.
func (s MVCCRangeKeyStack) Len() int {
return len(s.Versions)
}
// Newest returns the timestamp of the newest range key in the stack.
func (s MVCCRangeKeyStack) Newest() hlc.Timestamp {
return s.Versions.Newest()
}
// Oldest returns the timestamp of the oldest range key in the stack.
func (s MVCCRangeKeyStack) Oldest() hlc.Timestamp {
return s.Versions.Oldest()
}
// Remove removes the given version from the stack, returning true if it was
// found.
func (s *MVCCRangeKeyStack) Remove(ts hlc.Timestamp) (MVCCRangeKeyVersion, bool) {
return s.Versions.Remove(ts)
}
// String formats the MVCCRangeKeyStack as a string.
func (s MVCCRangeKeyStack) String() string {
return fmt.Sprintf("%s%s", s.Bounds, s.Versions)
}
// Timestamps returns the timestamps of all versions.
func (s MVCCRangeKeyStack) Timestamps() []hlc.Timestamp {
return s.Versions.Timestamps()
}
// Trim trims the versions to the time span [from, to] (both inclusive in order)
// in place. Returns true if any versions were removed.
func (s *MVCCRangeKeyStack) Trim(from, to hlc.Timestamp) bool {
return s.Versions.Trim(from, to)
}
// Clear clears out the version stack, but retains any byte slices.
func (v *MVCCRangeKeyVersions) Clear() {
*v = (*v)[:0]
}
// Clone clones the versions.
func (v MVCCRangeKeyVersions) Clone() MVCCRangeKeyVersions {
c := make(MVCCRangeKeyVersions, len(v))
for i, version := range v {
c[i] = version.Clone()
}
return c
}
// CloneInto clones the versions, reusing the byte slices and backing array of
// the given slice.
func (v MVCCRangeKeyVersions) CloneInto(c *MVCCRangeKeyVersions) {
if length, capacity := len(v), cap(*c); length > capacity {
// Extend the slice, keeping the existing versions to reuse their Value byte
// slices. The compiler optimizes away the intermediate, appended slice.
*c = append((*c)[:capacity], make(MVCCRangeKeyVersions, length-capacity)...)
} else {
*c = (*c)[:length]
}
for i := range v {
(*c)[i].Timestamp = v[i].Timestamp
(*c)[i].Value = append((*c)[i].Value[:0], v[i].Value...)
(*c)[i].EncodedTimestampSuffix = append((*c)[i].EncodedTimestampSuffix[:0], v[i].EncodedTimestampSuffix...)
}
}
// Covers returns true if any version in the stack is above the given timestamp.
// A timestamp of 0 (i.e. an intent) is considered to be above all timestamps,
// and thus not covered by any range key.
func (v MVCCRangeKeyVersions) Covers(ts hlc.Timestamp) bool {
return !v.IsEmpty() && !ts.IsEmpty() && ts.LessEq(v[0].Timestamp)
}
// Equal returns whether the versions in the specified MVCCRangeKeyVersions
// match this slice exactly, in both timestamps and values.
func (v MVCCRangeKeyVersions) Equal(other MVCCRangeKeyVersions) bool {
if len(v) != len(other) {
return false
}
for i := range v {
if !v[i].Equal(other[i]) {
return false
}
}
return true
}
// Excise removes the versions in the given [from, to] span (inclusive, in
// order) in place, returning true if any versions were removed.
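//
// For example, given versions at timestamps [5, 3, 1] (newest first),
// Excise(2, 4) removes the version at 3 and keeps [5, 1], whereas Trim(2, 4)
// would keep only [3].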
func (v *MVCCRangeKeyVersions) Excise(from, to hlc.Timestamp) bool {
// We assume that to will often be near the current time, and use a linear
// rather than a binary search, which will often match on the first range key.
start := len(*v)
for i, version := range *v {
if version.Timestamp.LessEq(to) {
start = i
break
}
}
// We then use a binary search to find the lower bound.
end := sort.Search(len(*v), func(i int) bool {
return (*v)[i].Timestamp.Less(from)
})
if start >= end {
return false
} else if start == 0 {
*v = (*v)[end:]
} else {
*v = append((*v)[:start], (*v)[end:]...)
}
return true
}
// FirstAtOrAbove does a binary search for the first range key version at or
// above the given timestamp. Returns false if no matching range key was found.
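//
// For example, given versions at timestamps [5, 3, 1] (newest first),
// FirstAtOrAbove(4) returns the version at 5, FirstAtOrAbove(3) returns the
// version at 3, and FirstAtOrAbove(6) returns false.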
func (v MVCCRangeKeyVersions) FirstAtOrAbove(ts hlc.Timestamp) (MVCCRangeKeyVersion, bool) {
// This is kind of odd due to sort.Search() semantics: we do a binary search
// for the first range key that's below the timestamp, then return the
// previous range key if any.
if length := len(v); length > 0 {
if i := sort.Search(length, func(i int) bool {
return v[i].Timestamp.Less(ts)
}); i > 0 {
return v[i-1], true
}
}
return MVCCRangeKeyVersion{}, false
}
// FirstAtOrBelow does a binary search for the first range key version at or
// below the given timestamp. Returns false if no matching range key was found.
func (v MVCCRangeKeyVersions) FirstAtOrBelow(ts hlc.Timestamp) (MVCCRangeKeyVersion, bool) {
if length := len(v); length > 0 {
if i := sort.Search(length, func(i int) bool {
return v[i].Timestamp.LessEq(ts)
}); i < length {
return v[i], true
}
}
return MVCCRangeKeyVersion{}, false
}
// HasBetween checks whether an MVCC range key exists between the two given
// timestamps (both inclusive, in order).
func (v MVCCRangeKeyVersions) HasBetween(lower, upper hlc.Timestamp) bool {
if version, ok := v.FirstAtOrAbove(lower); ok {
// Consider equal timestamps to be "between". This shouldn't really happen,
// since MVCC enforces that point and range keys can't have the same timestamp.
return version.Timestamp.LessEq(upper)
}
return false
}
// IsEmpty returns true if the stack is empty (no versions).
func (v MVCCRangeKeyVersions) IsEmpty() bool {
return len(v) == 0
}
// Newest returns the timestamp of the newest range key in the stack.
func (v MVCCRangeKeyVersions) Newest() hlc.Timestamp {
if v.IsEmpty() {
return hlc.Timestamp{}
}
return v[0].Timestamp
}
// Oldest returns the timestamp of the oldest range key in the stack.
func (v MVCCRangeKeyVersions) Oldest() hlc.Timestamp {
if v.IsEmpty() {
return hlc.Timestamp{}
}
return v[len(v)-1].Timestamp
}
// Remove removes the given timestamp in place, returning it and true if it was
// found.
func (v *MVCCRangeKeyVersions) Remove(ts hlc.Timestamp) (MVCCRangeKeyVersion, bool) {
if v.IsEmpty() {
return MVCCRangeKeyVersion{}, false
}
// Fast path: check first version.
if (*v)[0].Timestamp.Equal(ts) {
r := (*v)[0]
*v = (*v)[1:]
return r, true
}
if i := sort.Search(len(*v), func(i int) bool {
return (*v)[i].Timestamp.LessEq(ts)
}); i < len(*v) && (*v)[i].Timestamp.Equal(ts) {
r := (*v)[i]
*v = append((*v)[:i], (*v)[i+1:]...)
return r, true
}
return MVCCRangeKeyVersion{}, false
}
// String formats the MVCCRangeKeyVersions as a string.
func (v MVCCRangeKeyVersions) String() string {
var sb strings.Builder
sb.WriteString("[")
for i, version := range v {
if i > 0 {
sb.WriteString(" ")
}
sb.WriteString(version.String())
}
sb.WriteString("]")
return sb.String()
}
// Timestamps returns the timestamps of all versions.
func (v MVCCRangeKeyVersions) Timestamps() []hlc.Timestamp {
timestamps := make([]hlc.Timestamp, 0, len(v))
for _, version := range v {
timestamps = append(timestamps, version.Timestamp)
}
return timestamps
}
// Trim trims the versions to the time span [from, to] (both inclusive in
// order) in place. Returns true if any versions were removed.
func (v *MVCCRangeKeyVersions) Trim(from, to hlc.Timestamp) bool {
var removed bool
// We assume that to will often be near the current time, and use a linear
// rather than a binary search, which will often match on the first range key.
start := len(*v)
for i, version := range *v {
if version.Timestamp.LessEq(to) {
start = i
break
}
}
*v = (*v)[start:]
removed = start > 0
// We then use a binary search to find the lower bound.
if end := sort.Search(len(*v), func(i int) bool {
return (*v)[i].Timestamp.Less(from)
}); end < len(*v) {
*v = (*v)[:end]
removed = true
}
return removed
}
// Clone clones the version.
func (v MVCCRangeKeyVersion) Clone() MVCCRangeKeyVersion {
if v.Value != nil {
v.Value = append([]byte(nil), v.Value...)
}
if v.EncodedTimestampSuffix != nil {
v.EncodedTimestampSuffix = append([]byte(nil), v.EncodedTimestampSuffix...)
}
return v
}
// Equal returns true if the two versions are equal.
func (v MVCCRangeKeyVersion) Equal(o MVCCRangeKeyVersion) bool {
return v.Timestamp.Equal(o.Timestamp) && bytes.Equal(v.Value, o.Value)
}
// String formats the MVCCRangeKeyVersion as a string.
func (v MVCCRangeKeyVersion) String() string {
return fmt.Sprintf("%s=%x", v.Timestamp, v.Value)
}
// EncodeMVCCTimestampSuffixWithSyntheticBitForTesting is a utility to encode
// the provided timestamp as a MVCC timestamp key suffix with the synthetic bit
// set. The synthetic bit is no longer encoded/decoded into the hlc.Timestamp
// but may exist in existing databases. This utility allows a test to construct
// a timestamp with the synthetic bit for testing appropriate handling of
// existing keys with the bit set. It should only be used in tests. See #129592.
//
// TODO(jackson): Remove this function when we've migrated all keys to unset the
// synthetic bit.
func EncodeMVCCTimestampSuffixWithSyntheticBitForTesting(ts hlc.Timestamp) []byte {
const mvccEncodedTimestampWithSyntheticBitLen = mvccEncodedTimeWallLen +
mvccEncodedTimeLogicalLen +
mvccEncodedTimeSyntheticLen +
mvccEncodedTimeLengthLen
suffix := make([]byte, mvccEncodedTimestampWithSyntheticBitLen)
encodeMVCCTimestampToBuf(suffix, ts)
suffix[len(suffix)-2] = 0x01 // Synthetic bit.
suffix[len(suffix)-1] = mvccEncodedTimestampWithSyntheticBitLen
if decodedTS, err := DecodeMVCCTimestampSuffix(suffix); err != nil {
panic(err)
} else if !ts.Equal(decodedTS) {
panic(errors.AssertionFailedf("manufactured MVCC timestamp with synthetic bit decoded to %s not %s",
ts, decodedTS))
}
return suffix
}
// Copyright 2018 The Cockroach Authors.
//
// Use of this software is governed by the CockroachDB Software License
// included in the /LICENSE file.
package storage
import (
"bytes"
"fmt"
"github.com/cockroachdb/cockroach/pkg/keys"
"github.com/cockroachdb/cockroach/pkg/roachpb"
"github.com/cockroachdb/cockroach/pkg/storage/enginepb"
"github.com/cockroachdb/cockroach/pkg/util/bufalloc"
"github.com/cockroachdb/cockroach/pkg/util/hlc"
"github.com/cockroachdb/errors"
)
// MVCCLogicalOpType is an enum with values corresponding to each of the
// enginepb.MVCCLogicalOp variants.
//
// LogLogicalOp takes an MVCCLogicalOpType and a corresponding
// MVCCLogicalOpDetails instead of an enginepb.MVCCLogicalOp variant for two
// reasons. First, it serves as a form of abstraction so that callers of the
// method don't need to construct protos themselves. More importantly, it also
// avoids allocations in the common case where Writer.LogLogicalOp is a no-op.
// This makes LogLogicalOp essentially free for cases where logical op logging
// is disabled.
type MVCCLogicalOpType int
const (
// MVCCWriteValueOpType corresponds to the MVCCWriteValueOp variant.
MVCCWriteValueOpType MVCCLogicalOpType = iota
// MVCCWriteIntentOpType corresponds to the MVCCWriteIntentOp variant.
MVCCWriteIntentOpType
// MVCCUpdateIntentOpType corresponds to the MVCCUpdateIntentOp variant.
MVCCUpdateIntentOpType
// MVCCCommitIntentOpType corresponds to the MVCCCommitIntentOp variant.
MVCCCommitIntentOpType
// MVCCAbortIntentOpType corresponds to the MVCCAbortIntentOp variant.
MVCCAbortIntentOpType
// MVCCDeleteRangeOpType corresponds to the MVCCDeleteRangeOp variant.
MVCCDeleteRangeOpType
)
// MVCCLogicalOpDetails contains details about the occurrence of an MVCC logical
// operation.
type MVCCLogicalOpDetails struct {
Txn enginepb.TxnMeta
Key roachpb.Key
EndKey roachpb.Key // only set for MVCCDeleteRangeOpType
Timestamp hlc.Timestamp
// Safe indicates that the values in this struct will never be invalidated
// at a later point. If the details object cannot promise that its values
// will never be invalidated, an OpLoggerBatch will make a copy of all
// references before adding it to the log. TestMVCCOpLogWriter fails without
// this.
Safe bool
}
// OpLoggerBatch records a log of logical MVCC operations.
type OpLoggerBatch struct {
Batch
ops []enginepb.MVCCLogicalOp
opsAlloc bufalloc.ByteAllocator
}
// NewOpLoggerBatch creates a new batch that logs logical mvcc operations and
// wraps the provided batch.
func NewOpLoggerBatch(b Batch) *OpLoggerBatch {
ol := &OpLoggerBatch{Batch: b}
return ol
}
var _ Batch = &OpLoggerBatch{}
// LogLogicalOp implements the Writer interface.
func (ol *OpLoggerBatch) LogLogicalOp(op MVCCLogicalOpType, details MVCCLogicalOpDetails) {
ol.LogLogicalOpOnly(op, details)
ol.Batch.LogLogicalOp(op, details)
}
func (ol *OpLoggerBatch) LogLogicalOpOnly(op MVCCLogicalOpType, details MVCCLogicalOpDetails) {
if keys.IsLocal(details.Key) {
// Ignore mvcc operations on local keys.
if bytes.HasPrefix(details.Key, keys.LocalRangeLockTablePrefix) {
panic(fmt.Sprintf("seeing locktable key %s", details.Key.String()))
}
return
}
switch op {
case MVCCWriteValueOpType:
// Disallow inline values. Emitting these across rangefeeds doesn't make
// sense, since they can't be ordered and won't be handled by time-bound
// iterators in catchup scans. We could include them in the log and ignore
// them (or error) in rangefeeds, but the cost doesn't seem worth it.
if details.Timestamp.IsEmpty() {
panic(errors.AssertionFailedf("received inline key %s in MVCC logical op log", details.Key))
}
if !details.Safe {
ol.opsAlloc, details.Key = ol.opsAlloc.Copy(details.Key, 0)
}
ol.recordOp(&enginepb.MVCCWriteValueOp{
Key: details.Key,
Timestamp: details.Timestamp,
})
case MVCCWriteIntentOpType:
if !details.Safe {
ol.opsAlloc, details.Txn.Key = ol.opsAlloc.Copy(details.Txn.Key, 0)
}
ol.recordOp(&enginepb.MVCCWriteIntentOp{
TxnID: details.Txn.ID,
TxnKey: details.Txn.Key,
TxnIsoLevel: details.Txn.IsoLevel,
TxnMinTimestamp: details.Txn.MinTimestamp,
Timestamp: details.Timestamp,
})
case MVCCUpdateIntentOpType:
ol.recordOp(&enginepb.MVCCUpdateIntentOp{
TxnID: details.Txn.ID,
Timestamp: details.Timestamp,
})
case MVCCCommitIntentOpType:
if !details.Safe {
ol.opsAlloc, details.Key = ol.opsAlloc.Copy(details.Key, 0)
}
ol.recordOp(&enginepb.MVCCCommitIntentOp{
TxnID: details.Txn.ID,
Key: details.Key,
Timestamp: details.Timestamp,
})
case MVCCAbortIntentOpType:
ol.recordOp(&enginepb.MVCCAbortIntentOp{
TxnID: details.Txn.ID,
})
case MVCCDeleteRangeOpType:
if !details.Safe {
ol.opsAlloc, details.Key = ol.opsAlloc.Copy(details.Key, 0)
ol.opsAlloc, details.EndKey = ol.opsAlloc.Copy(details.EndKey, 0)
}
ol.recordOp(&enginepb.MVCCDeleteRangeOp{
StartKey: details.Key,
EndKey: details.EndKey,
Timestamp: details.Timestamp,
})
default:
panic(fmt.Sprintf("unexpected op type %v", op))
}
}
func (ol *OpLoggerBatch) recordOp(op interface{}) {
ol.ops = append(ol.ops, enginepb.MVCCLogicalOp{})
ol.ops[len(ol.ops)-1].MustSetValue(op)
}
// LogicalOps returns the list of all logical MVCC operations that have been
// recorded by the logger.
func (ol *OpLoggerBatch) LogicalOps() []enginepb.MVCCLogicalOp {
if ol == nil {
return nil
}
return ol.ops
}
// DisableOpLogger disables op logging for the given read/writer.
func DisableOpLogger(rw ReadWriter) ReadWriter {
return &noOpLogger{ReadWriter: rw}
}
type noOpLogger struct {
ReadWriter
}
func (n *noOpLogger) LogLogicalOp(MVCCLogicalOpType, MVCCLogicalOpDetails) {
}
// Copyright 2022 The Cockroach Authors.
//
// Use of this software is governed by the CockroachDB Software License
// included in the /LICENSE file.
package storage
import (
"encoding/binary"
"fmt"
"strings"
"github.com/cockroachdb/cockroach/pkg/roachpb"
"github.com/cockroachdb/cockroach/pkg/storage/enginepb"
"github.com/cockroachdb/cockroach/pkg/util/buildutil"
"github.com/cockroachdb/cockroach/pkg/util/hlc"
"github.com/cockroachdb/cockroach/pkg/util/metamorphic"
"github.com/cockroachdb/errors"
"github.com/cockroachdb/redact"
)
const (
extendedLenSize = 4 // also checksumSize for roachpb.Value
tagPos = extendedLenSize
tagSize = 1
extendedPreludeSize = extendedLenSize + tagSize
extendedEncodingSentinel = byte(roachpb.ValueType_MVCC_EXTENDED_ENCODING_SENTINEL)
)
// MVCCValue is a versioned value, stored at an associated MVCCKey with a
// non-zero version timestamp.
//
// MVCCValue wraps a roachpb.Value and extends it with MVCC-level metadata which
// is stored in an enginepb.MVCCValueHeader struct.
//
// The mvcc value has a "simple" and an "extended" encoding scheme, depending on
// whether the value's header is empty or not. If the value's header is empty,
// it is omitted in the encoding and the mvcc value's encoding is identical to
// that of roachpb.Value. This provides backwards compatibility and ensures that
// the MVCCValue optimizes away in the common case. If the value's header is not
// empty, it is prepended to the roachpb.Value encoding. The encoding scheme's
// variants are:
//
// Simple (identical to the roachpb.Value encoding):
//
// <4-byte-checksum><1-byte-tag><encoded-data>
//
// Extended (header prepended to roachpb.Value encoding):
//
// <4-byte-header-len><1-byte-sentinel><mvcc-header><4-byte-checksum><1-byte-tag><encoded-data>
//
// The two encoding scheme variants are distinguished using the 5th byte, which
// is either the roachpb.Value tag (which has many values) or a sentinel tag not
// used by the roachpb.Value encoding which indicates the extended encoding
// scheme.
//
// For a deletion tombstone, the encoding of roachpb.Value is special cased to
// be empty, i.e., no checksum, tag, or encoded-data. In that case the extended
// encoding above is simply:
//
// <4-byte-header-len><1-byte-sentinel><mvcc-header>
//
// To identify a deletion tombstone from an encoded MVCCValue, callers should
// decode the value using DecodeMVCCValue and then use the IsTombstone method.
// For example:
//
// valRaw := iter.UnsafeValue()
// val, err := DecodeMVCCValue(valRaw)
// if err != nil { ... }
// isTombstone := val.IsTombstone()
type MVCCValue struct {
enginepb.MVCCValueHeader
Value roachpb.Value
}
// IsTombstone returns whether the MVCCValue represents a deletion tombstone.
func (v MVCCValue) IsTombstone() bool {
return len(v.Value.RawBytes) == 0
}
// LocalTimestampNeeded returns whether the MVCCValue's local timestamp is
// needed, or whether it can be implied by (i.e. set to the same value as)
// its key's version timestamp.
//
// TODO(erikgrinaker): Consider making this and GetLocalTimestamp() generic over
// MVCCKey and MVCCRangeKey once generics have matured a bit.
func (v MVCCValue) LocalTimestampNeeded(keyTS hlc.Timestamp) bool {
// If the local timestamp is empty, it is assumed to be equal to the key's
// version timestamp and so the local timestamp is not needed.
return !v.LocalTimestamp.IsEmpty() &&
// If the local timestamp is not empty, it is safe for the local clock
// timestamp to be rounded down, as this will simply lead to additional
// uncertainty restarts. In such cases, the local timestamp is not needed.
// However, it is not safe for the local clock timestamp to be rounded up,
// as this could lead to stale reads. As a result, in such cases, the local
// timestamp is needed and cannot be implied by the version timestamp.
v.LocalTimestamp.ToTimestamp().Less(keyTS)
}
// GetLocalTimestamp returns the MVCCValue's local timestamp. If the local
// timestamp is not set explicitly, its implicit value is taken from the
// provided key version timestamp and returned.
func (v MVCCValue) GetLocalTimestamp(keyTS hlc.Timestamp) hlc.ClockTimestamp {
if v.LocalTimestamp.IsEmpty() {
return hlc.ClockTimestamp(keyTS)
}
return v.LocalTimestamp
}
// String implements the fmt.Stringer interface.
func (v MVCCValue) String() string {
return redact.StringWithoutMarkers(v)
}
// SafeFormat implements the redact.SafeFormatter interface.
func (v MVCCValue) SafeFormat(w redact.SafePrinter, _ rune) {
if v.MVCCValueHeader != (enginepb.MVCCValueHeader{}) {
fields := make([]string, 0)
w.Printf("{")
if !v.LocalTimestamp.IsEmpty() {
fields = append(fields, fmt.Sprintf("localTs=%s", v.LocalTimestamp))
}
if v.ImportEpoch != 0 {
fields = append(fields, fmt.Sprintf("importEpoch=%v", v.ImportEpoch))
}
if v.OriginID != 0 {
fields = append(fields, fmt.Sprintf("originID=%v", v.OriginID))
}
if v.OriginTimestamp.IsSet() {
fields = append(fields, fmt.Sprintf("originTs=%s", v.OriginTimestamp))
}
w.Print(strings.Join(fields, ", "))
w.Printf("}")
}
w.Print(v.Value.PrettyPrint())
}
// EncodeMVCCValueForExport encodes fields from the MVCCValueHeader
// that are appropriate for export out of the cluster.
//
// The returned bool is true if the provided buffer was used or
// reallocated and false if the MVCCValue.Value.RawBytes were returned
// directly.
func EncodeMVCCValueForExport(mvccValue MVCCValue, b []byte) ([]byte, bool, error) {
mvccValue.MVCCValueHeader.LocalTimestamp = hlc.ClockTimestamp{}
if mvccValue.MVCCValueHeader.IsEmpty() {
return mvccValue.Value.RawBytes, false, nil
}
return EncodeMVCCValueToBuf(mvccValue, b)
}
// When running a metamorphic build, disable the simple MVCC value encoding to
// prevent code from assuming that the MVCCValue encoding is identical to the
// roachpb.Value encoding.
var disableSimpleValueEncoding = metamorphic.ConstantWithTestBool(
"mvcc-value-disable-simple-encoding", false)
// DisableMetamorphicSimpleValueEncoding disables the disableSimpleValueEncoding
// metamorphic bool for the duration of a test, resetting it at the end.
func DisableMetamorphicSimpleValueEncoding(t interface {
Helper()
Cleanup(func())
}) {
t.Helper()
if disableSimpleValueEncoding {
disableSimpleValueEncoding = false
t.Cleanup(func() {
disableSimpleValueEncoding = true
})
}
}
// encodedMVCCValueSize returns the size of the MVCCValue when encoded.
func encodedMVCCValueSize(v MVCCValue) int {
if v.MVCCValueHeader.IsEmpty() && !disableSimpleValueEncoding {
return len(v.Value.RawBytes)
}
return extendedPreludeSize + v.MVCCValueHeader.Size() + len(v.Value.RawBytes)
}
// EncodeMVCCValue encodes an MVCCValue into its Pebble representation. See the
// comment on MVCCValue for a description of the encoding scheme.
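//
// A round-trip sketch (error handling elided; SetString is a roachpb.Value
// helper assumed here purely for illustration):
//
// var v MVCCValue
// v.Value.SetString("foo")
// buf, _ := EncodeMVCCValue(v)
// decoded, _ := DecodeMVCCValue(buf)
// // decoded.Value.RawBytes now carries the same bytes as v.Value.RawBytes.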
func EncodeMVCCValue(v MVCCValue) ([]byte, error) {
b, _, err := EncodeMVCCValueToBuf(v, nil)
return b, err
}
// EncodeMVCCValueToBuf encodes an MVCCValue into its Pebble
// representation. See the comment on MVCCValue for a description of
// the encoding scheme.
//
// If extended encoding is required, the given buffer will be used if
// it is large enough. If the provided buffer is not large enough a
// new buffer is allocated.
//
// The returned bool is true if the provided buffer was used or
// reallocated and false if the MVCCValue.Value.RawBytes were returned
// directly.
//
// TODO(erikgrinaker): This could be mid-stack inlined if we compared
// v.MVCCValueHeader == enginepb.MVCCValueHeader{} instead of IsEmpty(), but
// struct comparisons have a significant performance regression in Go 1.19 which
// negates the inlining gain. Reconsider this with Go 1.20. See:
// https://github.com/cockroachdb/cockroach/issues/88818
func EncodeMVCCValueToBuf(v MVCCValue, buf []byte) ([]byte, bool, error) {
if v.MVCCValueHeader.IsEmpty() && !disableSimpleValueEncoding {
// Simple encoding. Use the roachpb.Value encoding directly with no
// modification. No need to re-allocate or copy.
return v.Value.RawBytes, false, nil
}
// NB: This code is duplicated in encodeExtendedMVCCValueToSizedBuf and
// edits should be replicated there.
// Extended encoding. Wrap the roachpb.Value encoding with a header containing
// MVCC-level metadata. Requires a re-allocation and copy.
headerLen := v.MVCCValueHeader.Size()
headerSize := extendedPreludeSize + headerLen
valueSize := headerSize + len(v.Value.RawBytes)
if valueSize > cap(buf) {
buf = make([]byte, valueSize)
} else {
buf = buf[:valueSize]
}
// 4-byte-header-len
binary.BigEndian.PutUint32(buf, uint32(headerLen))
// 1-byte-sentinel
buf[tagPos] = extendedEncodingSentinel
// mvcc-header
//
// NOTE: we don't use protoutil to avoid passing v.MVCCValueHeader through
// an interface, which would cause a heap allocation and incur the cost of
// dynamic dispatch.
if _, err := v.MVCCValueHeader.MarshalToSizedBuffer(buf[extendedPreludeSize:headerSize]); err != nil {
return nil, false, errors.Wrap(err, "marshaling MVCCValueHeader")
}
// <4-byte-checksum><1-byte-tag><encoded-data> or empty for tombstone
copy(buf[headerSize:], v.Value.RawBytes)
return buf, true, nil
}
func mvccValueSize(v MVCCValue) (size int, extendedEncoding bool) {
if v.MVCCValueHeader.IsEmpty() && !disableSimpleValueEncoding {
return len(v.Value.RawBytes), false
}
return extendedPreludeSize + v.MVCCValueHeader.Size() + len(v.Value.RawBytes), true
}
// encodeExtendedMVCCValueToSizedBuf encodes an MVCCValue into its encoded form
// in the provided buffer. The provided buf must be exactly sized, matching the
// value returned by encodedMVCCValueSize.
//
// See EncodeMVCCValueToBuf for detailed comments on the encoding scheme.
func encodeExtendedMVCCValueToSizedBuf(v MVCCValue, buf []byte) error {
if buildutil.CrdbTestBuild {
if sz := encodedMVCCValueSize(v); sz != len(buf) {
panic(errors.AssertionFailedf("provided buf (len=%d) is not sized correctly; expected %d", len(buf), sz))
}
}
headerSize := len(buf) - len(v.Value.RawBytes)
headerLen := headerSize - extendedPreludeSize
binary.BigEndian.PutUint32(buf, uint32(headerLen))
buf[tagPos] = extendedEncodingSentinel
if _, err := v.MVCCValueHeader.MarshalToSizedBuffer(buf[extendedPreludeSize:headerSize]); err != nil {
return errors.Wrap(err, "marshaling MVCCValueHeader")
}
if buildutil.CrdbTestBuild && len(buf[headerSize:]) != len(v.Value.RawBytes) {
panic(errors.AssertionFailedf("insufficient space for raw value; expected %d, got %d", len(v.Value.RawBytes), len(buf[headerSize:])))
}
copy(buf[headerSize:], v.Value.RawBytes)
return nil
}
// DecodeMVCCValue decodes an MVCCValue from its Pebble representation.
//
// NOTE: this function does not inline, so it is not suitable for performance
// critical code paths. Instead, callers that care about performance and would
// like to avoid function calls should manually call the two decoding functions.
// tryDecodeSimpleMVCCValue does inline, so callers can use it to avoid making
// any function calls when decoding an MVCCValue that is encoded with the simple
// encoding.
func DecodeMVCCValue(buf []byte) (MVCCValue, error) {
v, ok, err := tryDecodeSimpleMVCCValue(buf)
if ok || err != nil {
return v, err
}
return decodeExtendedMVCCValue(buf, true)
}
// DecodeValueFromMVCCValue decodes an MVCCValue and returns the
// roachpb.Value portion without parsing the MVCCValueHeader.
//
// NB: Caller assumes that this function does not copy or re-allocate
// the underlying byte slice.
//
//gcassert:inline
func DecodeValueFromMVCCValue(buf []byte) (roachpb.Value, error) {
if len(buf) == 0 {
// Tombstone with no header.
return roachpb.Value{}, nil
}
if len(buf) <= tagPos {
return roachpb.Value{}, errMVCCValueMissingTag
}
if buf[tagPos] != extendedEncodingSentinel {
return roachpb.Value{RawBytes: buf}, nil
}
// Extended encoding
headerLen := binary.BigEndian.Uint32(buf)
headerSize := extendedPreludeSize + headerLen
if len(buf) < int(headerSize) {
return roachpb.Value{}, errMVCCValueMissingHeader
}
return roachpb.Value{RawBytes: buf[headerSize:]}, nil
}
// DecodeMVCCValueAndErr is a helper that can be called using the ([]byte,
// error) pair returned from the iterator UnsafeValue(), Value() methods.
func DecodeMVCCValueAndErr(buf []byte, err error) (MVCCValue, error) {
if err != nil {
return MVCCValue{}, err
}
return DecodeMVCCValue(buf)
}
// Static error definitions, to permit inlining.
var errMVCCValueMissingTag = errors.Errorf("invalid encoded mvcc value, missing tag")
var errMVCCValueMissingHeader = errors.Errorf("invalid encoded mvcc value, missing header")
// tryDecodeSimpleMVCCValue attempts to decode an MVCCValue that is using the
// simple encoding. If successful, returns the decoded value and true. If the
// value was using the extended encoding, returns false, in which case the
// caller should call decodeExtendedMVCCValue.
//
//gcassert:inline
func tryDecodeSimpleMVCCValue(buf []byte) (MVCCValue, bool, error) {
if len(buf) == 0 {
// Tombstone with no header.
return MVCCValue{}, true, nil
}
if len(buf) <= tagPos {
return MVCCValue{}, false, errMVCCValueMissingTag
}
if buf[tagPos] != extendedEncodingSentinel {
// Simple encoding. The encoding is equivalent to the roachpb.Value
// encoding, so inflate it directly. No need to copy or slice.
return MVCCValue{Value: roachpb.Value{RawBytes: buf}}, true, nil
}
// Extended encoding. The caller should call decodeExtendedMVCCValue.
return MVCCValue{}, false, nil
}
//gcassert:inline
func decodeMVCCValueIgnoringHeader(buf []byte) (MVCCValue, error) {
if len(buf) == 0 {
return MVCCValue{}, nil
}
if len(buf) <= tagPos {
return MVCCValue{}, errMVCCValueMissingTag
}
if buf[tagPos] != extendedEncodingSentinel {
return MVCCValue{Value: roachpb.Value{RawBytes: buf}}, nil
}
// Extended encoding
headerLen := binary.BigEndian.Uint32(buf)
headerSize := extendedPreludeSize + headerLen
if len(buf) < int(headerSize) {
return MVCCValue{}, errMVCCValueMissingHeader
}
return MVCCValue{Value: roachpb.Value{RawBytes: buf[headerSize:]}}, nil
}
func decodeExtendedMVCCValue(buf []byte, unmarshalHeader bool) (MVCCValue, error) {
headerLen := binary.BigEndian.Uint32(buf)
headerSize := extendedPreludeSize + headerLen
if len(buf) < int(headerSize) {
return MVCCValue{}, errMVCCValueMissingHeader
}
var v MVCCValue
if unmarshalHeader {
// NOTE: we don't use protoutil to avoid passing header through an interface,
// which would cause a heap allocation and incur the cost of dynamic dispatch.
if err := v.MVCCValueHeader.Unmarshal(buf[extendedPreludeSize:headerSize]); err != nil {
return MVCCValue{}, errors.Wrapf(err, "unmarshaling MVCCValueHeader")
}
}
v.Value.RawBytes = buf[headerSize:]
return v, nil
}
// EncodedMVCCValueIsTombstone is faster than decoding an MVCCValue and then
// calling MVCCValue.IsTombstone. It should be used when the caller does not
// need a decoded value.
//
//gcassert:inline
func EncodedMVCCValueIsTombstone(buf []byte) (bool, error) {
if len(buf) == 0 {
return true, nil
}
if len(buf) <= tagPos {
return false, errMVCCValueMissingTag
}
if buf[tagPos] != extendedEncodingSentinel {
return false, nil
}
headerSize := extendedPreludeSize + binary.BigEndian.Uint32(buf)
if len(buf) < int(headerSize) {
return false, errMVCCValueMissingHeader
}
return len(buf) == int(headerSize), nil
}
func init() {
// Inject the format dependency into the enginepb package.
enginepb.FormatBytesAsValue = func(v []byte) redact.RedactableString {
val, err := DecodeMVCCValue(v)
if err != nil {
return redact.Sprintf("err=%v", err)
}
return redact.Sprint(val)
}
}
// Copyright 2021 The Cockroach Authors.
//
// Use of this software is governed by the CockroachDB Software License
// included in the /LICENSE file.
package storage
import (
"cmp"
"context"
"slices"
"time"
"github.com/cockroachdb/cockroach/pkg/base"
"github.com/cockroachdb/cockroach/pkg/cloud"
"github.com/cockroachdb/cockroach/pkg/roachpb"
"github.com/cockroachdb/cockroach/pkg/settings/cluster"
"github.com/cockroachdb/cockroach/pkg/storage/disk"
"github.com/cockroachdb/cockroach/pkg/storage/fs"
"github.com/cockroachdb/cockroach/pkg/storage/storagepb"
"github.com/cockroachdb/errors"
"github.com/cockroachdb/pebble"
"github.com/cockroachdb/pebble/vfs"
"github.com/cockroachdb/pebble/wal"
)
// A ConfigOption may be passed to Open to configure the storage engine.
type ConfigOption func(cfg *engineConfig) error
// CombineOptions combines many options into one.
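//
// For example (a sketch combining options defined in this file):
//
// opt := CombineOptions(MustExist, CacheSize(1<<30), MaxOpenFiles(1000))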
func CombineOptions(opts ...ConfigOption) ConfigOption {
return func(cfg *engineConfig) error {
for _, opt := range opts {
if err := opt(cfg); err != nil {
return err
}
}
return nil
}
}
// MustExist configures an engine to error on Open if the target directory
// does not contain an initialized store.
var MustExist ConfigOption = func(cfg *engineConfig) error {
cfg.mustExist = true
return nil
}
// DisableAutomaticCompactions configures an engine to be opened with disabled
// automatic compactions. Used primarily for debugCompactCmd.
var DisableAutomaticCompactions ConfigOption = func(cfg *engineConfig) error {
cfg.opts.DisableAutomaticCompactions = true
return nil
}
// ForceWriterParallelism configures an engine to be opened with parallelism
// forced on for sstable writers.
var ForceWriterParallelism ConfigOption = func(cfg *engineConfig) error {
cfg.opts.Experimental.ForceWriterParallelism = true
return nil
}
// ForTesting configures the engine for use in testing. It may randomize some
// config options to improve test coverage.
var ForTesting ConfigOption = func(cfg *engineConfig) error {
cfg.beforeClose = append(cfg.beforeClose, func(p *Pebble) {
m := p.db.Metrics()
if m.Keys.MissizedTombstonesCount > 0 {
// A missized tombstone is a Pebble DELSIZED tombstone that encodes
// the wrong size of the value it deletes. This kind of tombstone is
// written when ClearOptions.ValueSizeKnown=true. If this assertion
// failed, something might be awry in the code clearing the key. Are
// we feeding the wrong value length to ValueSize?
panic(errors.AssertionFailedf("expected to find 0 missized tombstones; found %d", m.Keys.MissizedTombstonesCount))
}
})
return nil
}
// Attributes configures the engine's attributes.
func Attributes(attrs roachpb.Attributes) ConfigOption {
return func(cfg *engineConfig) error {
cfg.attrs = attrs
return nil
}
}
// storeSize configures the maximum allowable size for a store.
// Can be specified either as a percentage of total capacity or
// an absolute byte size; if both are specified, the percentage takes
// precedence.
type storeSize struct {
bytes int64
percent float64
}
// MaxSizeBytes sets the intended maximum store size as an absolute byte
// value. MaxSizeBytes is used for calculating free space and making rebalancing
// decisions.
func MaxSizeBytes(size int64) ConfigOption {
return maxSize(storeSize{bytes: size})
}
// MaxSizePercent sets the intended maximum store size as the specified percentage
// of total capacity. MaxSizePercent is used for calculating free space and making
// rebalancing decisions.
func MaxSizePercent(percent float64) ConfigOption {
return maxSize(storeSize{percent: percent})
}
// maxSize sets the intended maximum store size. MaxSize is used for
// calculating free space and making rebalancing decisions. Either an
// absolute size or a percentage of total capacity can be specified;
// if both are specified, the percentage is used.
func maxSize(size storeSize) ConfigOption {
return func(cfg *engineConfig) error {
cfg.maxSize = size
return nil
}
}
// BlockSize sets the engine block size, primarily for testing purposes.
func BlockSize(size int) ConfigOption {
return func(cfg *engineConfig) error {
for i := range cfg.opts.Levels {
cfg.opts.Levels[i].BlockSize = size
cfg.opts.Levels[i].IndexBlockSize = size
}
return nil
}
}
// TargetFileSize sets the target file size across all levels of the LSM,
// primarily for testing purposes.
func TargetFileSize(size int64) ConfigOption {
return func(cfg *engineConfig) error {
for i := range cfg.opts.Levels {
cfg.opts.Levels[i].TargetFileSize = size
}
return nil
}
}
// MaxWriterConcurrency sets the concurrency of the sstable Writers. A concurrency
// of 0 implies no parallelism in the Writer, and a concurrency of 1 or more implies
// parallelism in the Writer. Currently, there's no difference between a concurrency
// of 1 or more.
func MaxWriterConcurrency(concurrency int) ConfigOption {
return func(cfg *engineConfig) error {
cfg.opts.Experimental.MaxWriterConcurrency = concurrency
return nil
}
}
// MaxOpenFiles sets the maximum number of files an engine should open.
func MaxOpenFiles(count int) ConfigOption {
return func(cfg *engineConfig) error {
cfg.opts.MaxOpenFiles = count
return nil
}
}
// CacheSize configures the size of the block cache. Note that this option is
// ignored if Caches() is also used.
func CacheSize(size int64) ConfigOption {
return func(cfg *engineConfig) error {
cfg.opts.CacheSize = size
return nil
}
}
// Caches sets the block and file caches. Useful when multiple stores share
// the same caches.
func Caches(cache *pebble.Cache, fileCache *pebble.FileCache) ConfigOption {
return func(cfg *engineConfig) error {
cfg.opts.Cache = cache
cfg.opts.FileCache = fileCache
return nil
}
}
// BallastSize sets the amount reserved by a ballast file for manual
// out-of-disk recovery.
func BallastSize(size int64) ConfigOption {
return func(cfg *engineConfig) error {
cfg.ballastSize = size
return nil
}
}
// SharedStorage enables use of shared storage (experimental).
func SharedStorage(sharedStorage cloud.ExternalStorage) ConfigOption {
return func(cfg *engineConfig) error {
cfg.sharedStorage = sharedStorage
if cfg.sharedStorage != nil && cfg.opts.FormatMajorVersion < pebble.FormatMinForSharedObjects {
cfg.opts.FormatMajorVersion = pebble.FormatMinForSharedObjects
}
return nil
}
}
// SecondaryCache enables use of a secondary cache to store shared objects.
func SecondaryCache(size int64) ConfigOption {
return func(cfg *engineConfig) error {
cfg.opts.Experimental.SecondaryCacheSizeBytes = size
return nil
}
}
// RemoteStorageFactory enables use of remote storage (experimental).
func RemoteStorageFactory(accessor *cloud.EarlyBootExternalStorageAccessor) ConfigOption {
return func(cfg *engineConfig) error {
cfg.remoteStorageFactory = accessor
return nil
}
}
// MaxConcurrentCompactions configures the maximum number of concurrent
// compactions an Engine will execute.
func MaxConcurrentCompactions(n int) ConfigOption {
return func(cfg *engineConfig) error {
cfg.opts.MaxConcurrentCompactions = func() int { return n }
return nil
}
}
// MaxConcurrentDownloads configures the maximum number of concurrent
// download compactions an Engine will execute.
func MaxConcurrentDownloads(n int) ConfigOption {
return func(cfg *engineConfig) error {
cfg.opts.MaxConcurrentDownloads = func() int { return n }
return nil
}
}
// LBaseMaxBytes configures the maximum number of bytes for LBase.
func LBaseMaxBytes(v int64) ConfigOption {
return func(cfg *engineConfig) error {
cfg.opts.LBaseMaxBytes = v
return nil
}
}
func noopConfigOption(*engineConfig) error {
return nil
}
func errConfigOption(err error) func(*engineConfig) error {
return func(*engineConfig) error { return err }
}
func makeExternalWALDir(
engineCfg *engineConfig,
externalDir storagepb.ExternalPath,
defaultFS vfs.FS,
diskWriteStats disk.WriteStatsManager,
) (wal.Dir, error) {
// If the store is encrypted, we require that all the WAL failover dirs also
// be encrypted so that the user doesn't accidentally leak data unencrypted
// onto the filesystem.
if engineCfg.env.Encryption != nil && externalDir.Encryption == nil {
return wal.Dir{}, errors.Newf("must provide --enterprise-encryption flag for %q, used as WAL failover path for encrypted store %q",
externalDir.Path, engineCfg.env.Dir)
}
if engineCfg.env.Encryption == nil && externalDir.Encryption != nil {
return wal.Dir{}, errors.Newf("must provide --enterprise-encryption flag for store %q, specified WAL failover path %q is encrypted",
engineCfg.env.Dir, externalDir.Path)
}
env, err := fs.InitEnv(context.Background(), defaultFS, externalDir.Path, fs.EnvConfig{
RW: engineCfg.env.RWMode(),
EncryptionOptions: externalDir.Encryption,
}, diskWriteStats)
if err != nil {
return wal.Dir{}, err
}
engineCfg.afterClose = append(engineCfg.afterClose, env.Close)
return wal.Dir{
FS: env,
Dirname: externalDir.Path,
}, nil
}
// WALFailover configures automatic failover of the engine's write-ahead log to
// another volume in the event the WAL becomes blocked on a write that does not
// complete within a reasonable duration.
func WALFailover(
walCfg storagepb.WALFailover,
storeEnvs fs.Envs,
defaultFS vfs.FS,
diskWriteStats disk.WriteStatsManager,
) ConfigOption {
// The set of options available in single-store versus multi-store
// configurations varies. This is in part due to the need to store the multiple
// stores' WALs separately. When WALFailoverExplicitPath is provided, we have
// no stable store identifier available to disambiguate the WALs of multiple
// stores. Note that the store ID is not known when a store is first opened.
if len(storeEnvs) == 1 {
switch walCfg.Mode {
case storagepb.WALFailoverMode_DEFAULT, storagepb.WALFailoverMode_AMONG_STORES:
return noopConfigOption
case storagepb.WALFailoverMode_DISABLED:
// Check if the user provided an explicit previous path. If they did, they
// were previously using WALFailoverExplicitPath and are now disabling it.
// We need to add the explicit path to WALRecoveryDirs.
if walCfg.PrevPath.IsSet() {
return func(cfg *engineConfig) error {
walDir, err := makeExternalWALDir(cfg, walCfg.PrevPath, defaultFS, diskWriteStats)
if err != nil {
return err
}
cfg.opts.WALRecoveryDirs = append(cfg.opts.WALRecoveryDirs, walDir)
return nil
}
}
// No PrevPath was provided. The user may be simply expressing their
// intent to not run with WAL failover, regardless of any future default
// values. If WAL failover was previously enabled, Open will error when it
// notices the OPTIONS file encodes a WAL failover secondary that was not
// provided to Options.WALRecoveryDirs.
return noopConfigOption
case storagepb.WALFailoverMode_EXPLICIT_PATH:
// The user has provided an explicit path to which we should fail over WALs.
return func(cfg *engineConfig) error {
walDir, err := makeExternalWALDir(cfg, walCfg.Path, defaultFS, diskWriteStats)
if err != nil {
return err
}
cfg.opts.WALFailover = makePebbleWALFailoverOptsForDir(cfg.settings, walDir)
if walCfg.PrevPath.IsSet() {
walDir, err := makeExternalWALDir(cfg, walCfg.PrevPath, defaultFS, diskWriteStats)
if err != nil {
return err
}
cfg.opts.WALRecoveryDirs = append(cfg.opts.WALRecoveryDirs, walDir)
}
return nil
}
default:
panic("unreachable")
}
}
switch walCfg.Mode {
case storagepb.WALFailoverMode_DEFAULT:
// If the user specified no WAL failover setting, we default to disabling WAL
// failover and assume that the previous process did not have WAL failover
// enabled (so there's no need to populate Options.WALRecoveryDirs). If an
// operator had WAL failover enabled and now wants to disable it, they must
// explicitly set --wal-failover=disabled for the next process.
return noopConfigOption
case storagepb.WALFailoverMode_DISABLED:
// Check if the user provided an explicit previous path; that's unsupported
// in multi-store configurations.
if walCfg.PrevPath.IsSet() {
return errConfigOption(errors.Newf("storage: cannot use explicit prev_path --wal-failover option with multiple stores"))
}
// No PrevPath was provided, implying that the user previously was using
// WALFailoverAmongStores.
// Fallthrough
case storagepb.WALFailoverMode_EXPLICIT_PATH:
// Not supported for multi-store configurations.
return errConfigOption(errors.Newf("storage: cannot use explicit path --wal-failover option with multiple stores"))
case storagepb.WALFailoverMode_AMONG_STORES:
// Fallthrough
default:
panic("unreachable")
}
// Either
// 1. mode == WALFailoverAmongStores
// or
// 2. mode == WALFailoverDisabled and the user previously was using
// WALFailoverAmongStores, so we should build the deterministic store pairing
// to determine which WALRecoveryDirs to pass to which engines.
//
// For each store, we need to determine which store is its secondary for the
// purpose of WALs. Even if failover is disabled, it's possible that it wasn't
// when the previous process ran, and the secondary's wal dir may have WALs
// that need to be replayed.
//
// To assign secondaries, we sort by path and dictate that the next store in
// the slice is the secondary. Note that in-memory stores may not have unique
// paths, in which case we fall back to using the ordering of the store flags
// (which falls out of the use of a stable sort).
//
// TODO(jackson): Using the path is a simple way to assign secondaries, but
// it's not resilient to changing between absolute and relative paths,
// introducing symlinks, etc. Since we have the fs.Envs already available, we
// could peek into the data directories, find the most recent OPTIONS file and
// parse out the previous secondary if any. If we had device nos and inodes
// available, we could deterministically sort by those instead.
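//
// For example, with three stores at /mnt/data1, /mnt/data2, and /mnt/data3,
// the path-sorted order pairs data1 with data2, data2 with data3, and data3
// wraps around to data1 as its secondary.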
sortedEnvs := slices.Clone(storeEnvs)
slices.SortStableFunc(sortedEnvs, func(a, b *fs.Env) int {
return cmp.Compare(a.Dir, b.Dir)
})
indexOfEnv := func(e *fs.Env) (int, bool) {
for i := range sortedEnvs {
if sortedEnvs[i] == e {
return i, true
}
}
return 0, false
}
return func(cfg *engineConfig) error {
// Find the Env being opened in the slice of sorted envs.
idx, ok := indexOfEnv(cfg.env)
if !ok {
panic(errors.AssertionFailedf("storage: opening a store with an unrecognized filesystem Env (dir=%s)", cfg.env.Dir))
}
// Ensure that either all the stores are encrypted, or none are.
for _, storeEnv := range sortedEnvs {
if (storeEnv.Encryption == nil) != (cfg.env.Encryption == nil) {
return errors.Newf("storage: must provide --enterprise-encryption flag for all stores or none if using WAL failover")
}
}
failoverIdx := (idx + 1) % len(sortedEnvs)
secondaryEnv := sortedEnvs[failoverIdx]
// Ref once to ensure the secondary Env isn't closed before this Engine has
// been closed if the secondary's corresponding Engine is closed first.
secondaryEnv.Ref()
cfg.afterClose = append(cfg.afterClose, secondaryEnv.Close)
secondary := wal.Dir{
FS: secondaryEnv,
// Use auxiliary/wals-among-stores within the other stores directory.
Dirname: secondaryEnv.PathJoin(secondaryEnv.Dir, base.AuxiliaryDir, "wals-among-stores"),
}
if walCfg.Mode == storagepb.WALFailoverMode_AMONG_STORES {
cfg.opts.WALFailover = makePebbleWALFailoverOptsForDir(cfg.settings, secondary)
return nil
}
// mode == WALFailoverDisabled
cfg.opts.WALRecoveryDirs = append(cfg.opts.WALRecoveryDirs, secondary)
return nil
}
}
func makePebbleWALFailoverOptsForDir(
settings *cluster.Settings, dir wal.Dir,
) *pebble.WALFailoverOptions {
return &pebble.WALFailoverOptions{
Secondary: dir,
FailoverOptions: wal.FailoverOptions{
// Leave most of the options at their defaults, but
// UnhealthyOperationLatencyThreshold should be pulled from the
// cluster setting.
UnhealthyOperationLatencyThreshold: func() (time.Duration, bool) {
return walFailoverUnhealthyOpThreshold.Get(&settings.SV), true
},
},
}
}
// PebbleOptions contains Pebble-specific options in the same format as a
// Pebble OPTIONS file. For example:
// [Options]
// delete_range_flush_delay=2s
// flush_split_bytes=4096
func PebbleOptions(pebbleOptions string, parseHooks *pebble.ParseHooks) ConfigOption {
return func(cfg *engineConfig) error {
return cfg.opts.Parse(pebbleOptions, parseHooks)
}
}
// DiskMonitor configures a monitor to track disk stats.
func DiskMonitor(diskMonitor *disk.Monitor) ConfigOption {
return func(cfg *engineConfig) error {
cfg.diskMonitor = diskMonitor
return nil
}
}
// DiskWriteStatsCollector configures an engine to categorically track disk write stats.
func DiskWriteStatsCollector(dsc *vfs.DiskWriteStatsCollector) ConfigOption {
return func(cfg *engineConfig) error {
cfg.DiskWriteStatsCollector = dsc
return nil
}
}
// BlockConcurrencyLimitDivisor sets the divisor used to calculate the block
// load concurrency limit: the current value of the BlockLoadConcurrencyLimit
// setting divided by the divisor. It should be set to the number of stores.
//
// A value of 0 disables the limiter.
func BlockConcurrencyLimitDivisor(d int) ConfigOption {
return func(cfg *engineConfig) error {
cfg.blockConcurrencyLimitDivisor = d
return nil
}
}
// If enables the given option if enable is true.
func If(enable bool, opt ConfigOption) ConfigOption {
if enable {
return opt
}
return func(cfg *engineConfig) error { return nil }
}
// InMemory re-exports fs.InMemory.
//
// TODO(jackson): Update callers to use fs.InMemory directly.
var InMemory = fs.InMemory
// Open opens a new Pebble storage engine, reading and writing data to the
// provided fs.Env, configured with the provided options.
//
// If successful, the returned Engine takes ownership over the provided fs.Env's
// reference. When the Engine is closed, the fs.Env is closed once too. If the
// Env must be retained beyond the Engine's lifetime, the caller should Ref() it
// first.
func Open(
ctx context.Context, env *fs.Env, settings *cluster.Settings, opts ...ConfigOption,
) (*Pebble, error) {
if settings == nil {
return nil, errors.AssertionFailedf("Open requires non-nil *cluster.Settings")
}
var cfg engineConfig
cfg.env = env
cfg.settings = settings
cfg.opts = DefaultPebbleOptions()
cfg.opts.FS = env
cfg.opts.ReadOnly = env.IsReadOnly()
for _, opt := range opts {
if err := opt(&cfg); err != nil {
// Run after-close hooks if there are any. This ensures we
// release any references to fs.Envs that would've been held by
// the engine if it had been successfully opened.
for _, f := range cfg.afterClose {
f()
}
return nil, err
}
}
p, err := newPebble(ctx, cfg)
if err != nil {
// Run after-close hooks if there are any. This ensures we
// release any references to fs.Envs that would've been held by
// the engine if it had been successfully opened.
for _, f := range cfg.afterClose {
f()
}
return nil, err
}
return p, nil
}
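// Illustrative sketch of the ownership contract above (assumed caller code):
// a caller that wants to keep using the fs.Env after the Engine is closed
// takes an extra reference first.
//
//	env.Ref() // extra reference so env outlives the engine
//	eng, err := Open(ctx, env, settings)
//	if err != nil {
//		env.Close() // release the extra reference taken above
//		return err
//	}
//	// ... use eng ...
//	eng.Close() // releases the reference Open took ownership of
//	// env remains usable here; release the extra reference when done.
//	env.Close()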
// Copyright 2019 The Cockroach Authors.
//
// Use of this software is governed by the CockroachDB Software License
// included in the /LICENSE file.
package storage
import (
"context"
"encoding/binary"
"encoding/json"
"fmt"
"math"
"os"
"path/filepath"
"sort"
"strconv"
"strings"
"sync"
"sync/atomic"
"time"
"github.com/cockroachdb/cockroach/pkg/base"
"github.com/cockroachdb/cockroach/pkg/cloud"
"github.com/cockroachdb/cockroach/pkg/clusterversion"
"github.com/cockroachdb/cockroach/pkg/keys"
"github.com/cockroachdb/cockroach/pkg/roachpb"
"github.com/cockroachdb/cockroach/pkg/settings"
"github.com/cockroachdb/cockroach/pkg/settings/cluster"
"github.com/cockroachdb/cockroach/pkg/storage/disk"
"github.com/cockroachdb/cockroach/pkg/storage/enginepb"
"github.com/cockroachdb/cockroach/pkg/storage/fs"
"github.com/cockroachdb/cockroach/pkg/storage/pebbleiter"
"github.com/cockroachdb/cockroach/pkg/util/buildutil"
"github.com/cockroachdb/cockroach/pkg/util/envutil"
"github.com/cockroachdb/cockroach/pkg/util/humanizeutil"
"github.com/cockroachdb/cockroach/pkg/util/log"
"github.com/cockroachdb/cockroach/pkg/util/metamorphic"
"github.com/cockroachdb/cockroach/pkg/util/protoutil"
"github.com/cockroachdb/cockroach/pkg/util/syncutil"
"github.com/cockroachdb/cockroach/pkg/util/timeutil"
"github.com/cockroachdb/cockroach/pkg/util/tracing"
"github.com/cockroachdb/crlib/fifo"
"github.com/cockroachdb/errors"
"github.com/cockroachdb/errors/oserror"
"github.com/cockroachdb/logtags"
"github.com/cockroachdb/pebble"
"github.com/cockroachdb/pebble/bloom"
"github.com/cockroachdb/pebble/cockroachkvs"
"github.com/cockroachdb/pebble/objstorage/objstorageprovider"
"github.com/cockroachdb/pebble/objstorage/remote"
"github.com/cockroachdb/pebble/rangekey"
"github.com/cockroachdb/pebble/replay"
"github.com/cockroachdb/pebble/sstable"
"github.com/cockroachdb/pebble/sstable/block"
"github.com/cockroachdb/pebble/vfs"
humanize "github.com/dustin/go-humanize"
)
// UseEFOS controls whether uses of pebble Snapshots should use
// EventuallyFileOnlySnapshots instead. This reduces write-amp with the main
// tradeoff being higher space-amp. Note that UseExciseForSnapshots, if true,
// effectively causes EventuallyFileOnlySnapshots to be used as well.
//
// Note: Do NOT read this setting directly. Use ShouldUseEFOS() instead.
var UseEFOS = settings.RegisterBoolSetting(
settings.SystemOnly,
"storage.experimental.eventually_file_only_snapshots.enabled",
"set to false to disable eventually-file-only-snapshots (kv.snapshot_receiver.excise.enabled must also be false)",
metamorphic.ConstantWithTestBool(
"storage.experimental.eventually_file_only_snapshots.enabled", true), /* defaultValue */
settings.WithPublic)
// UseExciseForSnapshots controls whether virtual-sstable-based excises should
// be used instead of range deletions for clearing out replica contents as part
// of a rebalance/recovery snapshot application. Applied on the receiver side.
// Note that setting this setting to true also effectively causes UseEFOS above
// to become true. This interaction is why this setting is defined in the
// storage package even though it mostly affects KV.
var UseExciseForSnapshots = settings.RegisterBoolSetting(
settings.SystemOnly,
"kv.snapshot_receiver.excise.enabled",
"set to false to disable excises in place of range deletions for KV snapshots",
metamorphic.ConstantWithTestBool(
"kv.snapshot_receiver.excise.enabled", true), /* defaultValue */
settings.WithPublic,
)
// IngestSplitEnabled controls whether ingest-time splitting is enabled in
// Pebble. This feature allows for existing sstables to be split into multiple
// virtual sstables at ingest time if that allows for an ingestion sstable to go
// into a lower level than it would otherwise be in. No keys are masked with
// this split; it only happens if there are no keys in that existing sstable
// in the span of the incoming sstable.
var IngestSplitEnabled = settings.RegisterBoolSetting(
settings.SystemOnly,
"storage.ingest_split.enabled",
"set to false to disable ingest-time splitting that lowers write-amplification",
metamorphic.ConstantWithTestBool(
"storage.ingest_split.enabled", true), /* defaultValue */
settings.WithPublic,
)
// ColumnarBlocksEnabled controls whether columnar-blocks are enabled in Pebble.
var ColumnarBlocksEnabled = settings.RegisterBoolSetting(
settings.SystemVisible,
"storage.columnar_blocks.enabled",
"set to true to enable columnar-blocks to store KVs in a columnar format",
metamorphic.ConstantWithTestBool(
"storage.columnar_blocks.enabled", true /* defaultValue */),
settings.WithPublic,
)
// deleteCompactionsCanExcise controls whether delete compactions can
// apply rangedels/rangekeydels on sstables they partially apply to, through
// an excise operation, instead of just applying the rangedels/rangekeydels
// that fully delete sstables.
var deleteCompactionsCanExcise = settings.RegisterBoolSetting(
settings.SystemVisible,
"storage.delete_compaction_excise.enabled",
"set to false to direct Pebble to not partially excise sstables in delete-only compactions",
metamorphic.ConstantWithTestBool(
"storage.delete_compaction_excise.enabled", true), /* defaultValue */
settings.WithPublic)
// IngestAsFlushable controls whether ingested sstables that overlap the
// memtable may be lazily ingested: written to the WAL and enqueued in the list
// of flushables (eg, memtables, large batches and now lazily-ingested
// sstables). This only affects sstables that are ingested in the future. If an
// sstable was already lazily ingested but not flushed, a crash and subsequent
// recovery will still enqueue that sstable as flushable when the ingest's WAL
// entry is replayed.
//
// This cluster setting will be removed in a subsequent release.
var IngestAsFlushable = settings.RegisterBoolSetting(
settings.ApplicationLevel, // used to init temp storage in virtual cluster servers
"storage.ingest_as_flushable.enabled",
"set to true to enable lazy ingestion of sstables",
metamorphic.ConstantWithTestBool(
"storage.ingest_as_flushable.enabled", true))
// MinCapacityForBulkIngest is the fraction of remaining store capacity
// under which bulk ingestion requests are rejected.
var MinCapacityForBulkIngest = settings.RegisterFloatSetting(
settings.SystemOnly,
"kv.bulk_io_write.min_capacity_remaining_fraction",
"remaining store capacity fraction below which bulk ingestion requests are rejected",
0.05,
settings.FloatInRange(0.04, 0.3),
settings.WithPublic,
)
// BlockLoadConcurrencyLimit controls the maximum number of outstanding
// filesystem read operations for loading sstable blocks. This limit is a
// last-resort queueing mechanism to avoid memory issues or running against the
// Go runtime's OS thread limit (see runtime/debug.SetMaxThreads, which
// defaults to 10,000).
//
// The limit is distributed evenly between all stores (rounding up). This is to
// provide isolation between the stores - we don't want one bad disk blocking
// other stores.
var BlockLoadConcurrencyLimit = settings.RegisterIntSetting(
settings.ApplicationLevel, // used by temp storage as well
"storage.block_load.node_max_active",
"maximum number of outstanding sstable block reads per host",
7500,
settings.IntInRange(1, 9000),
)
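// Worked example (illustrative numbers): with the default node-wide limit of
// 7500 and 7 stores, each store's limiter is sized to the rounded-up share
//
//	perStore := (7500 + 7 - 1) / 7 // = 1072, i.e. ceil(7500/7)
//
// matching the (limit+d-1)/d computation where this setting is consumed in
// newPebble.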
// readaheadModeInformed controls the pebble.ReadaheadConfig.Informed setting.
//
// Note that the setting is taken into account when a table enters the Pebble
// table cache; it can take a while for an updated setting to take effect.
var readaheadModeInformed = settings.RegisterEnumSetting(
settings.ApplicationLevel, // used by temp storage as well
"storage.readahead_mode.informed",
"the readahead mode for operations which are known to read through large chunks of data; "+
"sys-readahead performs explicit prefetching via the readahead syscall; "+
"fadv-sequential lets the OS perform prefetching via fadvise(FADV_SEQUENTIAL)",
"fadv-sequential",
map[objstorageprovider.ReadaheadMode]string{
objstorageprovider.NoReadahead: "off",
objstorageprovider.SysReadahead: "sys-readahead",
objstorageprovider.FadviseSequential: "fadv-sequential",
},
)
// readaheadModeSpeculative controls the pebble.ReadaheadConfig.Speculative setting.
//
// Note that the setting is taken into account when a table enters the Pebble
// table cache; it can take a while for an updated setting to take effect.
var readaheadModeSpeculative = settings.RegisterEnumSetting(
settings.ApplicationLevel, // used by temp storage as well
"storage.readahead_mode.speculative",
"the readahead mode that is used automatically when sequential reads are detected; "+
"sys-readahead performs explicit prefetching via the readahead syscall; "+
"fadv-sequential starts with explicit prefetching via the readahead syscall then automatically "+
"switches to OS-driven prefetching via fadvise(FADV_SEQUENTIAL)",
"fadv-sequential",
map[objstorageprovider.ReadaheadMode]string{
objstorageprovider.NoReadahead: "off",
objstorageprovider.SysReadahead: "sys-readahead",
objstorageprovider.FadviseSequential: "fadv-sequential",
},
)
// compressionAlgorithm is an enumeration of the available compression
// algorithms.
type compressionAlgorithm int64
const (
compressionAlgorithmSnappy compressionAlgorithm = 1
compressionAlgorithmZstd compressionAlgorithm = 2
compressionAlgorithmNone compressionAlgorithm = 3
)
// String implements fmt.Stringer for compressionAlgorithm.
func (c compressionAlgorithm) String() string {
switch c {
case compressionAlgorithmSnappy:
return "snappy"
case compressionAlgorithmZstd:
return "zstd"
case compressionAlgorithmNone:
return "none"
default:
panic(errors.Errorf("unknown compression type: %d", c))
}
}
// RegisterCompressionAlgorithmClusterSetting is a helper to register an enum
// cluster setting with the given name, description and default value.
func RegisterCompressionAlgorithmClusterSetting(
name settings.InternalKey, desc string, defaultValue compressionAlgorithm,
) *settings.EnumSetting[compressionAlgorithm] {
return settings.RegisterEnumSetting(
// NB: We can't use settings.SystemOnly today because we may need to read the
// value from within a tenant building an sstable for AddSSTable.
settings.SystemVisible, name,
desc,
// TODO(jackson): Consider using a metamorphic constant here, but many tests
// will need to override it because they depend on a deterministic sstable
// size.
defaultValue.String(),
map[compressionAlgorithm]string{
compressionAlgorithmSnappy: compressionAlgorithmSnappy.String(),
compressionAlgorithmZstd: compressionAlgorithmZstd.String(),
compressionAlgorithmNone: compressionAlgorithmNone.String(),
},
settings.WithPublic,
)
}
// CompressionAlgorithmStorage determines the compression algorithm used to
// compress data blocks when writing sstables for use in a Pebble store (written
// directly, or constructed for ingestion on a remote store via AddSSTable).
// Users should call getCompressionAlgorithm with the cluster setting, rather
// than calling Get directly.
var CompressionAlgorithmStorage = RegisterCompressionAlgorithmClusterSetting(
"storage.sstable.compression_algorithm",
`determines the compression algorithm to use when compressing sstable data blocks for use in a Pebble store;`,
compressionAlgorithmSnappy, // Default.
)
// CompressionAlgorithmBackupStorage determines the compression algorithm used
// to compress data blocks when writing sstables that contain backup row data
// storage. Users should call getCompressionAlgorithm with the cluster setting,
// rather than calling Get directly.
var CompressionAlgorithmBackupStorage = RegisterCompressionAlgorithmClusterSetting(
"storage.sstable.compression_algorithm_backup_storage",
`determines the compression algorithm to use when compressing sstable data blocks for backup row data storage;`,
compressionAlgorithmSnappy, // Default.
)
// CompressionAlgorithmBackupTransport determines the compression algorithm used
// to compress data blocks when writing sstables that will be immediately
// iterated and will never need to touch disk. These sstables typically have
// much larger blocks and benefit from compression. However, this compression
// algorithm may be different to the one used when writing out the sstables for
// remote storage. Users should call getCompressionAlgorithm with the cluster
// setting, rather than calling Get directly.
var CompressionAlgorithmBackupTransport = RegisterCompressionAlgorithmClusterSetting(
"storage.sstable.compression_algorithm_backup_transport",
`determines the compression algorithm to use when compressing sstable data blocks for backup transport;`,
compressionAlgorithmSnappy, // Default.
)
func getCompressionAlgorithm(
ctx context.Context,
settings *cluster.Settings,
setting *settings.EnumSetting[compressionAlgorithm],
) pebble.Compression {
switch setting.Get(&settings.SV) {
case compressionAlgorithmSnappy:
return pebble.SnappyCompression
case compressionAlgorithmZstd:
return pebble.ZstdCompression
case compressionAlgorithmNone:
return pebble.NoCompression
default:
return pebble.DefaultCompression
}
}
var walFailoverUnhealthyOpThreshold = settings.RegisterDurationSetting(
settings.SystemOnly,
"storage.wal_failover.unhealthy_op_threshold",
"the latency of a WAL write considered unhealthy and triggers a failover to a secondary WAL location",
100*time.Millisecond,
settings.WithPublic,
)
// TODO(ssd): This could be SystemOnly but we currently init pebble
// engines for temporary storage. Temporary engines shouldn't really
// care about download compactions, but they do currently simply
// because of code organization.
var concurrentDownloadCompactions = settings.RegisterIntSetting(
settings.ApplicationLevel,
"storage.max_download_compaction_concurrency",
"the maximum number of concurrent download compactions",
8,
settings.IntWithMinimum(1),
)
// ShouldUseEFOS returns true if either of the UseEFOS or UseExciseForSnapshots
// cluster settings are enabled, and EventuallyFileOnlySnapshots must be used
// to guarantee snapshot-like semantics.
func ShouldUseEFOS(settings *settings.Values) bool {
return UseEFOS.Get(settings) || UseExciseForSnapshots.Get(settings)
}
// EngineComparer is a pebble.Comparer object that implements MVCC-specific
// comparator settings for use with Pebble.
var EngineComparer = func() pebble.Comparer {
// We use the pebble/cockroachkvs implementation, but we override the
// FormatKey method.
c := cockroachkvs.Comparer
c.FormatKey = func(k []byte) fmt.Formatter {
decoded, ok := DecodeEngineKey(k)
if !ok {
return mvccKeyFormatter{err: errors.Errorf("invalid encoded engine key: %x", k)}
}
if decoded.IsMVCCKey() {
mvccKey, err := decoded.ToMVCCKey()
if err != nil {
return mvccKeyFormatter{err: err}
}
return mvccKeyFormatter{key: mvccKey}
}
return EngineKeyFormatter{key: decoded}
}
// TODO(jackson): Consider overriding ValidateKey and using the stricter
// EngineKey.Validate. Today some tests create lock-table keys without the
// lock table prefix and these test keys fail EngineKey.Validate.
return c
}()
// KeySchemas holds the set of KeySchemas understandable by CockroachDB.
var KeySchemas = []*pebble.KeySchema{&cockroachkvs.KeySchema}
// TODO(jackson): We need to rethink uses of DefaultKeySchema when we introduce
// a new key schema.
// DefaultKeySchema is the name of the default key schema.
var DefaultKeySchema = cockroachkvs.KeySchema.Name
// MVCCMerger is a pebble.Merger object that implements the merge operator used
// by Cockroach.
var MVCCMerger = &pebble.Merger{
Name: "cockroach_merge_operator",
Merge: func(_, value []byte) (pebble.ValueMerger, error) {
merger := NewMVCCValueMerger()
err := merger.MergeNewer(value)
if err != nil {
return nil, err
}
return merger, nil
},
}
var _ sstable.BlockIntervalSuffixReplacer = MVCCBlockIntervalSuffixReplacer{}
type MVCCBlockIntervalSuffixReplacer struct{}
func (MVCCBlockIntervalSuffixReplacer) ApplySuffixReplacement(
interval sstable.BlockInterval, newSuffix []byte,
) (sstable.BlockInterval, error) {
synthDecoded, err := DecodeMVCCTimestampSuffix(newSuffix)
if err != nil {
return sstable.BlockInterval{}, errors.AssertionFailedf("could not decode synthetic suffix")
}
synthDecodedWalltime := uint64(synthDecoded.WallTime)
// The returned bound includes the synthetic suffix, regardless of its logical
// component.
return sstable.BlockInterval{Lower: synthDecodedWalltime, Upper: synthDecodedWalltime + 1}, nil
}
type pebbleIntervalMapper struct{}
var _ sstable.IntervalMapper = pebbleIntervalMapper{}
// MapPointKey is part of the sstable.IntervalMapper interface.
func (pebbleIntervalMapper) MapPointKey(
key pebble.InternalKey, value []byte,
) (sstable.BlockInterval, error) {
return mapSuffixToInterval(key.UserKey)
}
// MapRangeKeys is part of the sstable.IntervalMapper interface.
func (pebbleIntervalMapper) MapRangeKeys(span sstable.Span) (sstable.BlockInterval, error) {
var res sstable.BlockInterval
for _, k := range span.Keys {
i, err := mapSuffixToInterval(k.Suffix)
if err != nil {
return sstable.BlockInterval{}, err
}
res.UnionWith(i)
}
return res, nil
}
// mapSuffixToInterval maps the suffix of a key to a timestamp interval.
// The buffer can be an entire key or just the suffix.
func mapSuffixToInterval(b []byte) (sstable.BlockInterval, error) {
if len(b) == 0 {
return sstable.BlockInterval{}, nil
}
// Last byte is the version length + 1 when there is a version,
// else it is 0.
versionLen := int(b[len(b)-1])
if versionLen == 0 {
// This is not an MVCC key that we can collect.
return sstable.BlockInterval{}, nil
}
// prefixPartEnd points to the sentinel byte, unless this is a bare suffix, in
// which case the index is -1.
prefixPartEnd := len(b) - 1 - versionLen
// Sanity check: the index should be >= -1. Additionally, if the index is >=
// 0, it should point to the sentinel byte, as this is a full EngineKey.
if prefixPartEnd < -1 || (prefixPartEnd >= 0 && b[prefixPartEnd] != sentinel) {
return sstable.BlockInterval{}, errors.Errorf("invalid key %s", roachpb.Key(b).String())
}
// We don't need the last byte (the version length).
versionLen--
// Only collect if this looks like an MVCC timestamp.
if versionLen == engineKeyVersionWallTimeLen ||
versionLen == engineKeyVersionWallAndLogicalTimeLen ||
versionLen == engineKeyVersionWallLogicalAndSyntheticTimeLen {
// INVARIANT: -1 <= prefixPartEnd < len(b) - 1.
// Version consists of the bytes after the sentinel and before the length.
ts := binary.BigEndian.Uint64(b[prefixPartEnd+1:])
return sstable.BlockInterval{Lower: ts, Upper: ts + 1}, nil
}
return sstable.BlockInterval{}, nil
}
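// Worked example (assumed key bytes, for illustration): an MVCC key carrying
// only a wall time is laid out as
//
//	<prefix> 0x00 <8-byte big-endian wall time> 0x09
//
// where 0x00 is the sentinel byte and the trailing 0x09 is the version length
// (8) plus one. mapSuffixToInterval reads the 8 wall-time bytes following the
// sentinel and returns the interval [wallTime, wallTime+1).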
const mvccWallTimeIntervalCollector = "MVCCTimeInterval"
var _ pebble.BlockPropertyFilterMask = (*mvccWallTimeIntervalRangeKeyMask)(nil)
type mvccWallTimeIntervalRangeKeyMask struct {
sstable.BlockIntervalFilter
}
// SetSuffix implements the pebble.BlockPropertyFilterMask interface.
func (m *mvccWallTimeIntervalRangeKeyMask) SetSuffix(suffix []byte) error {
if len(suffix) == 0 {
// This is currently impossible, because the only range key Cockroach
// writes today is the MVCC Delete Range that's always suffixed.
return nil
}
ts, err := DecodeMVCCTimestampSuffix(suffix)
if err != nil {
return err
}
m.BlockIntervalFilter.SetInterval(uint64(ts.WallTime), math.MaxUint64)
return nil
}
// PebbleBlockPropertyCollectors is the list of functions to construct
// BlockPropertyCollectors.
var PebbleBlockPropertyCollectors = []func() pebble.BlockPropertyCollector{
func() pebble.BlockPropertyCollector {
return sstable.NewBlockIntervalCollector(
mvccWallTimeIntervalCollector,
pebbleIntervalMapper{},
MVCCBlockIntervalSuffixReplacer{},
)
},
}
// MinimumSupportedFormatVersion is the version that provides features that the
// Cockroach code relies on unconditionally (like range keys). New stores are by
// default created with this version. It should correspond to the minimum
// supported binary version.
const MinimumSupportedFormatVersion = pebble.FormatColumnarBlocks
// DefaultPebbleOptions returns the default pebble options.
func DefaultPebbleOptions() *pebble.Options {
opts := &pebble.Options{
Comparer: &EngineComparer,
FS: vfs.Default,
KeySchema: DefaultKeySchema,
KeySchemas: sstable.MakeKeySchemas(KeySchemas...),
// A value of 2 triggers a compaction when there is 1 sub-level.
L0CompactionThreshold: 2,
L0StopWritesThreshold: 1000,
LBaseMaxBytes: 64 << 20, // 64 MB
Levels: make([]pebble.LevelOptions, 7),
// NB: Options.MaxConcurrentCompactions may be overridden in newPebble to
// allow overriding the max at runtime through
// Engine.SetCompactionConcurrency.
MaxConcurrentCompactions: getMaxConcurrentCompactions,
MemTableSize: 64 << 20, // 64 MB
MemTableStopWritesThreshold: 4,
Merger: MVCCMerger,
BlockPropertyCollectors: PebbleBlockPropertyCollectors,
FormatMajorVersion: MinimumSupportedFormatVersion,
}
opts.Experimental.L0CompactionConcurrency = l0SubLevelCompactionConcurrency
// Automatically flush 10s after the first range tombstone is added to a
// memtable. This ensures that we can reclaim space even when there's no
// activity on the database generating flushes.
opts.FlushDelayDeleteRange = 10 * time.Second
// Automatically flush 10s after the first range key is added to a memtable.
// This ensures that range keys are quickly flushed, allowing use of lazy
// combined iteration within Pebble.
opts.FlushDelayRangeKey = 10 * time.Second
// Enable deletion pacing. This helps prevent disk slowness events on some
// SSDs, that kick off an expensive GC if a lot of files are deleted at
// once.
opts.TargetByteDeletionRate = 128 << 20 // 128 MB
opts.Experimental.ShortAttributeExtractor = shortAttributeExtractorForValues
opts.Experimental.RequiredInPlaceValueBound = pebble.UserKeyPrefixBound{
Lower: EncodeMVCCKey(MVCCKey{Key: keys.LocalRangeLockTablePrefix}),
Upper: EncodeMVCCKey(MVCCKey{Key: keys.LocalRangeLockTablePrefix.PrefixEnd()}),
}
// Disable multi-level compaction heuristic for now. See #134423
// for why this was disabled, and what needs to be changed to reenable it.
// This issue tracks re-enablement: https://github.com/cockroachdb/pebble/issues/4139
opts.Experimental.MultiLevelCompactionHeuristic = pebble.NoMultiLevel{}
opts.Experimental.UserKeyCategories = userKeyCategories
for i := 0; i < len(opts.Levels); i++ {
l := &opts.Levels[i]
l.BlockSize = 32 << 10 // 32 KB
l.IndexBlockSize = 256 << 10 // 256 KB
l.FilterPolicy = bloom.FilterPolicy(10)
l.FilterType = pebble.TableFilter
if i > 0 {
l.TargetFileSize = opts.Levels[i-1].TargetFileSize * 2
}
l.EnsureDefaults()
}
// These size classes are a subset of available size classes in jemalloc[1].
// The size classes are used by Pebble for determining target block sizes for
// flushes, with the goal of reducing internal fragmentation. There are many
// more size classes that could be included, however, sstable blocks have a
// target block size of 32KiB, a minimum size threshold of ~19.6KiB and are
// unlikely to exceed 128KiB.
//
// [1] https://jemalloc.net/jemalloc.3.html#size_classes
opts.AllocatorSizeClasses = []int{
16384,
20480, 24576, 28672, 32768,
40960, 49152, 57344, 65536,
81920, 98304, 114688, 131072,
}
return opts
}
func shortAttributeExtractorForValues(
key []byte, keyPrefixLen int, value []byte,
) (pebble.ShortAttribute, error) {
suffixLen := len(key) - keyPrefixLen
const lockTableSuffixLen = engineKeyVersionLockTableLen + sentinelLen
if suffixLen == engineKeyNoVersion || suffixLen == lockTableSuffixLen {
// Not a versioned MVCC value.
return 0, nil
}
isTombstone, err := EncodedMVCCValueIsTombstone(value)
if err != nil {
return 0, err
}
if isTombstone {
return 1, nil
}
return 0, nil
}
// engineConfig holds all configuration parameters and knobs used in setting up
// a new storage engine.
type engineConfig struct {
attrs roachpb.Attributes
// ballastSize is the amount reserved by a ballast file for manual
// out-of-disk recovery.
ballastSize int64
// env holds the initialized virtual filesystem that the Engine should use.
env *fs.Env
// maxSize is used for calculating free space and making rebalancing
// decisions. The zero value indicates that there is no absolute maximum size.
maxSize storeSize
// If true, creating the instance fails if the target directory does not hold
// an initialized instance.
//
// Makes no sense for in-memory instances.
mustExist bool
// pebble specific options.
opts *pebble.Options
// remoteStorageFactory is used to pass the ExternalStorage factory.
remoteStorageFactory *cloud.EarlyBootExternalStorageAccessor
// settings instance for cluster-wide knobs. Must not be nil.
settings *cluster.Settings
// sharedStorage is a cloud.ExternalStorage that can be used by all Pebble
// stores on this node and on other nodes to store sstables.
sharedStorage cloud.ExternalStorage
// beforeClose is a slice of functions to be invoked before the engine is closed.
beforeClose []func(*Pebble)
// afterClose is a slice of functions to be invoked after the engine is closed.
afterClose []func()
// diskMonitor is used to output a disk trace when a stall is detected.
diskMonitor *disk.Monitor
// DiskWriteStatsCollector is used to categorically track disk write metrics
// across all Pebble stores on this node.
DiskWriteStatsCollector *vfs.DiskWriteStatsCollector
// blockConcurrencyLimitDivisor is used to calculate the block load
// concurrency limit: it is the current value of the
// BlockLoadConcurrencyLimit setting divided by this value. It should be set
// to the number of stores.
//
// This is necessary because we want separate limiters per store (we don't
// want one bad disk to block other stores).
//
// A value of 0 disables the limit.
blockConcurrencyLimitDivisor int
}
// Pebble is a wrapper around a Pebble database instance.
type Pebble struct {
atomic struct {
// compactionConcurrency is the current compaction concurrency set on
// the Pebble store. The compactionConcurrency option in the Pebble
// Options struct is a closure which will return
// Pebble.atomic.compactionConcurrency.
//
// This mechanism allows us to change the Pebble compactionConcurrency
// on the fly without restarting Pebble.
compactionConcurrency uint64
}
cfg engineConfig
db *pebble.DB
closed bool
auxDir string
ballastPath string
properties roachpb.StoreProperties
// Stats updated by pebble.EventListener invocations, and returned in
// GetMetrics. Updated and retrieved atomically.
writeStallCount int64
writeStallDuration time.Duration
writeStallStartNanos int64
diskSlowCount int64
diskStallCount int64
singleDelInvariantViolationCount int64
singleDelIneffectualCount int64
sharedBytesRead int64
sharedBytesWritten int64
iterStats struct {
syncutil.Mutex
AggregatedIteratorStats
}
batchCommitStats struct {
syncutil.Mutex
AggregatedBatchCommitStats
}
diskWriteStatsCollector *vfs.DiskWriteStatsCollector
// Relevant options copied over from pebble.Options.
logCtx context.Context
logger pebble.LoggerAndTracer
eventListener *pebble.EventListener
mu struct {
// This mutex is the lowest in any lock ordering.
syncutil.Mutex
flushCompletedCallback func()
}
asyncDone sync.WaitGroup
// minVersion is the minimum CockroachDB version that can open this store.
minVersion roachpb.Version
storeIDPebbleLog *base.StoreIDContainer
replayer *replay.WorkloadCollector
diskSlowFunc atomic.Pointer[func(vfs.DiskSlowInfo)]
lowDiskSpaceFunc atomic.Pointer[func(pebble.LowDiskSpaceInfo)]
singleDelLogEvery log.EveryN
}
// WorkloadCollector implements a workloadCollectorGetter and returns the
// workload collector stored on Pebble. This method is invoked following a
// successful cast of an Engine to a `workloadCollectorGetter` type. This method
// allows Pebble-exclusive functionality to be used without modifying the
// Engine interface.
func (p *Pebble) WorkloadCollector() *replay.WorkloadCollector {
return p.replayer
}
var _ Engine = &Pebble{}
// WorkloadCollectorEnabled specifies whether the workload collector is enabled.
var WorkloadCollectorEnabled = envutil.EnvOrDefaultBool("COCKROACH_STORAGE_WORKLOAD_COLLECTOR", false)
// SetCompactionConcurrency sets the compaction concurrency and returns the
// previous value.
func (p *Pebble) SetCompactionConcurrency(n uint64) uint64 {
prevConcurrency := atomic.SwapUint64(&p.atomic.compactionConcurrency, n)
return prevConcurrency
}
// RegisterDiskSlowCallback registers a callback that will be run when a write
// operation on the disk has been seen to be slow. Only one handler can be
// registered per Pebble instance.
func (p *Pebble) RegisterDiskSlowCallback(f func(vfs.DiskSlowInfo)) {
p.diskSlowFunc.Store(&f)
}
// RegisterLowDiskSpaceCallback registers a callback that will be run when a
// disk is running out of space. Only one handler can be registered per Pebble
// instance.
func (p *Pebble) RegisterLowDiskSpaceCallback(f func(info pebble.LowDiskSpaceInfo)) {
p.lowDiskSpaceFunc.Store(&f)
}
// AdjustCompactionConcurrency adjusts the compaction concurrency up or down by
// the passed delta, down to a minimum of 1.
func (p *Pebble) AdjustCompactionConcurrency(delta int64) uint64 {
for {
current := atomic.LoadUint64(&p.atomic.compactionConcurrency)
adjusted := int64(current) + delta
if adjusted < 1 {
adjusted = 1
}
if atomic.CompareAndSwapUint64(&p.atomic.compactionConcurrency, current, uint64(adjusted)) {
return uint64(adjusted)
}
}
}
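// Hedged usage sketch (assumed caller code): the CAS loop above makes
// concurrent adjustments safe, with each caller observing a value that
// includes its own delta, e.g.
//
//	n := p.AdjustCompactionConcurrency(+2) // temporarily raise concurrency
//	defer p.AdjustCompactionConcurrency(-2)
//	_ = n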
// SetStoreID adds the store id to pebble logs.
func (p *Pebble) SetStoreID(ctx context.Context, storeID int32) error {
if p == nil {
return nil
}
p.storeIDPebbleLog.Set(ctx, storeID)
// Note that SetCreatorID only does something if remote storage is configured
// in the pebble options.
if storeID != base.TempStoreID {
if err := p.db.SetCreatorID(uint64(storeID)); err != nil {
return err
}
}
return nil
}
// GetStoreID returns the configured store ID.
func (p *Pebble) GetStoreID() (int32, error) {
if p == nil {
return 0, errors.AssertionFailedf("GetStoreID requires non-nil Pebble")
}
if p.storeIDPebbleLog == nil {
return 0, errors.AssertionFailedf("GetStoreID requires an initialized store ID container")
}
storeID := p.storeIDPebbleLog.Get()
if storeID == 0 {
return 0, errors.AssertionFailedf("GetStoreID must be called after calling SetStoreID")
}
return storeID, nil
}
func (p *Pebble) Download(ctx context.Context, span roachpb.Span, copy bool) error {
const copySpanName, rewriteSpanName = "pebble.Download", "pebble.DownloadRewrite"
spanName := rewriteSpanName
if copy {
spanName = copySpanName
}
ctx, sp := tracing.ChildSpan(ctx, spanName)
defer sp.Finish()
if p == nil {
return nil
}
downloadSpan := pebble.DownloadSpan{
StartKey: EncodeMVCCKey(MVCCKey{Key: span.Key}),
EndKey: EncodeMVCCKey(MVCCKey{Key: span.EndKey}),
ViaBackingFileDownload: copy,
}
return p.db.Download(ctx, []pebble.DownloadSpan{downloadSpan})
}
type remoteStorageAdaptor struct {
p *Pebble
ctx context.Context
factory *cloud.EarlyBootExternalStorageAccessor
}
func (r remoteStorageAdaptor) CreateStorage(locator remote.Locator) (remote.Storage, error) {
es, err := r.factory.OpenURL(r.ctx, string(locator))
return &externalStorageWrapper{p: r.p, ctx: r.ctx, es: es}, err
}
// ConfigureForSharedStorage is used to configure a pebble Options for shared
// storage.
var ConfigureForSharedStorage func(opts *pebble.Options, storage remote.Storage) error
// newPebble creates a new Pebble instance, at the specified path.
// Do not use directly (except in test); use Open instead.
//
// Direct users of newPebble: cfg.opts.{Logger,LoggerAndTracer} must not be
// set.
func newPebble(ctx context.Context, cfg engineConfig) (p *Pebble, err error) {
if cfg.opts == nil {
cfg.opts = DefaultPebbleOptions()
} else {
// Open also calls DefaultPebbleOptions before calling newPebble, so we
// are tolerant of Logger being set to pebble.DefaultLogger.
if cfg.opts.Logger != nil && cfg.opts.Logger != pebble.DefaultLogger {
return nil, errors.AssertionFailedf("Options.Logger is set to unexpected value")
}
// Clone the given options so that we are free to modify them.
cfg.opts = cfg.opts.Clone()
}
if cfg.opts.FormatMajorVersion < MinimumSupportedFormatVersion {
return nil, errors.AssertionFailedf(
"FormatMajorVersion is %d, should be at least %d",
cfg.opts.FormatMajorVersion, MinimumSupportedFormatVersion,
)
}
cfg.opts.FS = cfg.env
cfg.opts.Lock = cfg.env.DirectoryLock
cfg.opts.ErrorIfNotExists = cfg.mustExist
for i := range cfg.opts.Levels {
cfg.opts.Levels[i].Compression = func() block.Compression {
return getCompressionAlgorithm(ctx, cfg.settings, CompressionAlgorithmStorage)
}
}
if cfg.opts.MaxConcurrentDownloads == nil {
cfg.opts.MaxConcurrentDownloads = func() int {
return int(concurrentDownloadCompactions.Get(&cfg.settings.SV))
}
}
cfg.opts.EnsureDefaults()
// The context dance here is done so that we have a clean context without
// timeouts that has a copy of the log tags.
logCtx := logtags.WithTags(context.Background(), logtags.FromContext(ctx))
// The store ID cannot necessarily be determined when this function is
// called. Therefore, we use a container for the store ID.
storeIDContainer := &base.StoreIDContainer{}
logCtx = logtags.AddTag(logCtx, "s", storeIDContainer)
logCtx = logtags.AddTag(logCtx, "pebble", nil)
cfg.opts.Local.ReadaheadConfig = objstorageprovider.NewReadaheadConfig()
updateReadaheadFn := func(ctx context.Context) {
cfg.opts.Local.ReadaheadConfig.Set(
readaheadModeInformed.Get(&cfg.settings.SV),
readaheadModeSpeculative.Get(&cfg.settings.SV),
)
}
updateReadaheadFn(context.Background())
readaheadModeInformed.SetOnChange(&cfg.settings.SV, updateReadaheadFn)
readaheadModeSpeculative.SetOnChange(&cfg.settings.SV, updateReadaheadFn)
cfg.opts.WALMinSyncInterval = func() time.Duration {
return minWALSyncInterval.Get(&cfg.settings.SV)
}
cfg.opts.Experimental.EnableValueBlocks = func() bool { return true }
cfg.opts.Experimental.DisableIngestAsFlushable = func() bool {
// Disable flushable ingests if shared storage is enabled. This is because
// flushable ingests currently do not support Excise operations.
//
// TODO(bilal): Remove the first part of this || statement when
// https://github.com/cockroachdb/pebble/issues/2676 is completed, or when
// Pebble has better guards against this.
return cfg.sharedStorage != nil || !IngestAsFlushable.Get(&cfg.settings.SV)
}
cfg.opts.Experimental.IngestSplit = func() bool {
return IngestSplitEnabled.Get(&cfg.settings.SV)
}
cfg.opts.Experimental.EnableColumnarBlocks = func() bool {
return ColumnarBlocksEnabled.Get(&cfg.settings.SV)
}
cfg.opts.Experimental.EnableDeleteOnlyCompactionExcises = func() bool {
return deleteCompactionsCanExcise.Get(&cfg.settings.SV)
}
auxDir := cfg.opts.FS.PathJoin(cfg.env.Dir, base.AuxiliaryDir)
if !cfg.env.IsReadOnly() {
if err := cfg.opts.FS.MkdirAll(auxDir, 0755); err != nil {
return nil, err
}
}
ballastPath := base.EmergencyBallastFile(cfg.env.PathJoin, cfg.env.Dir)
if d := int64(cfg.blockConcurrencyLimitDivisor); d != 0 {
val := (BlockLoadConcurrencyLimit.Get(&cfg.settings.SV) + d - 1) / d
cfg.opts.LoadBlockSema = fifo.NewSemaphore(val)
BlockLoadConcurrencyLimit.SetOnChange(&cfg.settings.SV, func(ctx context.Context) {
cfg.opts.LoadBlockSema.UpdateCapacity((BlockLoadConcurrencyLimit.Get(&cfg.settings.SV) + d - 1) / d)
})
}
cfg.opts.Logger = nil // Defensive, since LoggerAndTracer will be used.
if cfg.opts.LoggerAndTracer == nil {
cfg.opts.LoggerAndTracer = pebbleLogger{
ctx: logCtx,
depth: 1,
}
}
// Else, already have a LoggerAndTracer. This only occurs in unit tests.
// Establish the emergency ballast if we can. If there's not sufficient
// disk space, the ballast will be reestablished from Capacity when the
// store's capacity is queried periodically.
if !cfg.opts.ReadOnly {
du, err := cfg.env.UnencryptedFS.GetDiskUsage(cfg.env.Dir)
// If the FS is an in-memory FS, GetDiskUsage returns
// vfs.ErrUnsupported and we skip ballast creation.
if err != nil && !errors.Is(err, vfs.ErrUnsupported) {
return nil, errors.Wrap(err, "retrieving disk usage")
} else if err == nil {
resized, err := maybeEstablishBallast(cfg.env.UnencryptedFS, ballastPath, cfg.ballastSize, du)
if err != nil {
return nil, errors.Wrap(err, "resizing ballast")
}
if resized {
cfg.opts.LoggerAndTracer.Infof("resized ballast %s to size %s",
ballastPath, humanizeutil.IBytes(cfg.ballastSize))
}
}
}
p = &Pebble{
cfg: cfg,
auxDir: auxDir,
ballastPath: ballastPath,
properties: computeStoreProperties(ctx, cfg),
logger: cfg.opts.LoggerAndTracer,
logCtx: logCtx,
storeIDPebbleLog: storeIDContainer,
replayer: replay.NewWorkloadCollector(cfg.env.Dir),
singleDelLogEvery: log.Every(5 * time.Minute),
diskWriteStatsCollector: cfg.DiskWriteStatsCollector,
}
// MaxConcurrentCompactions can be set by multiple sources, but all the
// sources will eventually call newPebble. So, we override
// cfg.opts.MaxConcurrentCompactions to a closure which will return
// Pebble.atomic.compactionConcurrency. This allows us both to honor
// the compaction concurrency which has already been set and to update
// the compactionConcurrency on the fly by changing the
// Pebble.atomic.compactionConcurrency variable.
p.atomic.compactionConcurrency = uint64(cfg.opts.MaxConcurrentCompactions())
cfg.opts.MaxConcurrentCompactions = func() int {
return int(atomic.LoadUint64(&p.atomic.compactionConcurrency))
}
// NB: The ordering of the event listeners passed to TeeEventListener is
// deliberate. The listener returned by makeMetricEtcEventListener is
// responsible for crashing the process if a DiskSlow event indicates the
// disk is stalled. While the logging subsystem should also be robust to
// stalls and crash the process if unable to write logs, there's less risk
// to sequencing the crashing listener first.
//
// For the same reason, make the logging call asynchronous for DiskSlow events.
// This prevents slow logging calls during a disk slow/stall event from holding
// up Pebble's internal disk health checking, and better obeys the
// EventListener contract for not having any functions block or take a while to
// run. Creating goroutines is acceptable given their low cost, and the low
// write concurrency to Pebble's FS (Pebble compactions + flushes + SQL
// spilling to disk). If the maximum concurrency of DiskSlow events increases
// significantly in the future, we can improve the logic here by queueing up
// most of the logging work (except for the Fatalf call), and have it be done
// by a single goroutine.
// TODO(jackson): Refactor this indirection; there's no need for the DiskSlow
// callback to go through the EventListener, and this structure is
// confusing.
cfg.env.RegisterOnDiskSlow(func(info pebble.DiskSlowInfo) {
el := cfg.opts.EventListener
p.async(func() { el.DiskSlow(info) })
})
el := pebble.TeeEventListener(
p.makeMetricEtcEventListener(logCtx),
pebble.MakeLoggingEventListener(pebbleLogger{
ctx: logCtx,
depth: 2, // skip over the EventListener stack frame
}),
)
p.eventListener = &el
cfg.opts.EventListener = &el
// If both cfg.sharedStorage and cfg.remoteStorageFactory are set, CRDB uses
// cfg.sharedStorage. Note that eventually we will enable using both at the
// same time, but we don't have the right abstractions in place to do that
// today.
//
// We prefer cfg.sharedStorage, since the Locator -> Storage mapping contained
// in it is needed for CRDB to function properly.
if cfg.sharedStorage != nil {
esWrapper := &externalStorageWrapper{p: p, es: cfg.sharedStorage, ctx: logCtx}
if ConfigureForSharedStorage == nil {
return nil, errors.New("shared storage requires CCL features")
}
if err := ConfigureForSharedStorage(cfg.opts, esWrapper); err != nil {
return nil, errors.Wrap(err, "error when configuring shared storage")
}
} else {
if cfg.remoteStorageFactory != nil {
cfg.opts.Experimental.RemoteStorage = remoteStorageAdaptor{p: p, ctx: logCtx, factory: cfg.remoteStorageFactory}
}
}
// Read the current store cluster version.
storeClusterVersion, minVerFileExists, err := getMinVersion(p.cfg.env.UnencryptedFS, cfg.env.Dir)
if err != nil {
return nil, err
}
if minVerFileExists {
// Avoid running a binary too new for this store. This is what you'd catch
// if, say, you restarted directly from v21.2 into v22.2 (bumping the min
// version) without going through v22.1 first.
//
// Note that "going through" above means that v22.1 successfully upgrades
// all existing stores. If v22.1 crashes half-way through the startup
// sequence (so now some stores have v21.2, but others v22.1) you are
// expected to run v22.1 again (hopefully without the crash this time) which
// would then rewrite all the stores.
if v := cfg.settings.Version; storeClusterVersion.Less(v.MinSupportedVersion()) {
if storeClusterVersion.Major < clusterversion.DevOffset && v.LatestVersion().Major >= clusterversion.DevOffset {
return nil, errors.Errorf(
"store last used with cockroach non-development version v%s "+
"cannot be opened by development version v%s",
storeClusterVersion, v.LatestVersion(),
)
}
return nil, errors.Errorf(
"store last used with cockroach version v%s "+
"is too old for running version v%s (which requires data from v%s or later)",
storeClusterVersion, v.LatestVersion(), v.MinSupportedVersion(),
)
}
cfg.opts.ErrorIfNotExists = true
} else {
if cfg.opts.ErrorIfNotExists || cfg.opts.ReadOnly {
// Make sure the message is not confusing if the store does exist but
// there is no min version file.
filename := p.cfg.env.UnencryptedFS.PathJoin(cfg.env.Dir, MinVersionFilename)
return nil, errors.Errorf(
"pebble: database %q does not exist (missing required file %q)",
cfg.env.Dir, filename,
)
}
// If there is no min version file, there should be no store. If there is
// one, it's either 1) a store from a very old version (which we don't want
// to open) or 2) an empty store that was created from a previous bootstrap
// attempt that failed right before writing out the min version file. We set
// a flag to disallow the open in case 1.
cfg.opts.ErrorIfNotPristine = true
}
if WorkloadCollectorEnabled {
p.replayer.Attach(cfg.opts)
}
db, err := pebble.Open(cfg.env.Dir, cfg.opts)
if err != nil {
// Decorate the errors caused by the flags we set above.
if minVerFileExists && errors.Is(err, pebble.ErrDBDoesNotExist) {
err = errors.Wrap(err, "min version file exists but store doesn't")
}
if !minVerFileExists && errors.Is(err, pebble.ErrDBNotPristine) {
err = errors.Wrap(err, "store has no min-version file; this can "+
"happen if the store was created by an old CockroachDB version that is no "+
"longer supported")
}
return nil, err
}
p.db = db
if !minVerFileExists {
storeClusterVersion = cfg.settings.Version.ActiveVersionOrEmpty(ctx).Version
if storeClusterVersion == (roachpb.Version{}) {
// If there is no active version, use the minimum supported version.
storeClusterVersion = cfg.settings.Version.MinSupportedVersion()
}
}
// The storage engine performs its own internal migrations
// through the setting of the store cluster version. When
// storage's min version is set, SetMinVersion writes to disk to
// commit to the new store cluster version. Then it idempotently
// applies any internal storage engine migrations necessitated
// or enabled by the new store cluster version. If we crash
// after committing the new store cluster version but before
// applying the internal migrations, we're left in an in-between
// state.
//
// To account for this, after the engine is open,
// unconditionally set the min cluster version again. If any
// storage engine state has not been updated, the call to
// SetMinVersion will update it. If all storage engine state is
// already updated, SetMinVersion is a noop.
if err := p.SetMinVersion(storeClusterVersion); err != nil {
p.Close()
return nil, err
}
return p, nil
}
var userKeyCategories = pebble.MakeUserKeyCategories(
EngineComparer.Compare,
category("local-1", keys.LocalRangeIDPrefix.AsRawKey()),
category("rangeid", keys.LocalRangeIDPrefix.AsRawKey().PrefixEnd()),
category("local-2", keys.LocalRangePrefix),
category("range", keys.LocalRangePrefix.PrefixEnd()),
category("local-3", keys.LocalRangeLockTablePrefix),
category("lock", keys.LocalRangeLockTablePrefix.PrefixEnd()),
category("local-4", keys.LocalPrefix.PrefixEnd()),
category("meta", keys.MetaMax),
category("system", keys.SystemMax),
category("tenant", nil),
)
func category(name string, upperBound roachpb.Key) pebble.UserKeyCategory {
if upperBound == nil {
return pebble.UserKeyCategory{Name: name}
}
ek := EngineKey{Key: upperBound}
return pebble.UserKeyCategory{Name: name, UpperBound: ek.Encode()}
}
// async launches the provided function in a new goroutine. It uses a wait group
// to synchronize with (*Pebble).Close to ensure all launched goroutines have
// exited before Close returns.
func (p *Pebble) async(fn func()) {
p.asyncDone.Add(1)
go func() {
defer p.asyncDone.Done()
fn()
}()
}
// writePreventStartupFile creates a file that will prevent nodes from automatically restarting after
// experiencing sstable corruption.
func (p *Pebble) writePreventStartupFile(ctx context.Context, corruptionError error) {
auxDir := p.GetAuxiliaryDir()
path := base.PreventedStartupFile(auxDir)
preventStartupMsg := fmt.Sprintf(`ATTENTION:
this node is terminating because of sstable corruption.
Corruption may be a consequence of a hardware error.
Error: %s
A file preventing this node from restarting was placed at:
%s`, corruptionError.Error(), path)
if err := fs.WriteFile(p.cfg.env.UnencryptedFS, path, []byte(preventStartupMsg), fs.UnspecifiedWriteCategory); err != nil {
log.Warningf(ctx, "%v", err)
}
}
func (p *Pebble) makeMetricEtcEventListener(ctx context.Context) pebble.EventListener {
return pebble.EventListener{
BackgroundError: func(err error) {
if errors.Is(err, pebble.ErrCorruption) {
p.writePreventStartupFile(ctx, err)
log.Fatalf(ctx, "local corruption detected: %v", err)
}
},
WriteStallBegin: func(info pebble.WriteStallBeginInfo) {
atomic.AddInt64(&p.writeStallCount, 1)
startNanos := timeutil.Now().UnixNano()
atomic.StoreInt64(&p.writeStallStartNanos, startNanos)
},
WriteStallEnd: func() {
startNanos := atomic.SwapInt64(&p.writeStallStartNanos, 0)
if startNanos == 0 {
// Should not happen since these callbacks are registered when Pebble
// is opened, but just in case we miss the WriteStallBegin, let's not
// corrupt the metric.
return
}
stallDuration := timeutil.Now().UnixNano() - startNanos
if stallDuration < 0 {
return
}
atomic.AddInt64((*int64)(&p.writeStallDuration), stallDuration)
},
DiskSlow: func(info pebble.DiskSlowInfo) {
maxSyncDuration := fs.MaxSyncDuration.Get(&p.cfg.settings.SV)
fatalOnExceeded := fs.MaxSyncDurationFatalOnExceeded.Get(&p.cfg.settings.SV)
if info.Duration.Seconds() >= maxSyncDuration.Seconds() {
atomic.AddInt64(&p.diskStallCount, 1)
// Note that the below log messages go to the main cockroach log, not
// the pebble-specific log.
//
// Run non-fatal log.* calls in separate goroutines as they could block
// if the logging device is also slow/stalling, preventing pebble's disk
// health checking from functioning correctly. See the comment in
// pebble.EventListener on why it's important for this method to return
// quickly.
if fatalOnExceeded {
// The write stall may prevent the process from exiting. If
// the process won't exit, we can at least terminate all our
// RPC connections first.
//
// See pkg/cli.runStart for where this function is hooked
// up.
log.MakeProcessUnavailable()
if p.cfg.diskMonitor != nil {
log.Fatalf(ctx, "disk stall detected: %s\n%s", info, p.cfg.diskMonitor.LogTrace())
} else {
log.Fatalf(ctx, "disk stall detected: %s", info)
}
} else {
if p.cfg.diskMonitor != nil {
p.async(func() {
log.Errorf(ctx, "disk stall detected: %s\n%s", info, p.cfg.diskMonitor.LogTrace())
})
} else {
p.async(func() { log.Errorf(ctx, "disk stall detected: %s", info) })
}
}
return
}
atomic.AddInt64(&p.diskSlowCount, 1)
// Call any custom handlers registered for disk slowness.
if fn := p.diskSlowFunc.Load(); fn != nil {
(*fn)(info)
}
},
FlushEnd: func(info pebble.FlushInfo) {
if info.Err != nil {
return
}
p.mu.Lock()
cb := p.mu.flushCompletedCallback
p.mu.Unlock()
if cb != nil {
cb()
}
},
LowDiskSpace: func(info pebble.LowDiskSpaceInfo) {
if fn := p.lowDiskSpaceFunc.Load(); fn != nil {
(*fn)(info)
}
},
PossibleAPIMisuse: func(info pebble.PossibleAPIMisuseInfo) {
switch info.Kind {
case pebble.IneffectualSingleDelete:
if p.singleDelLogEvery.ShouldLog() {
log.Infof(p.logCtx, "possible ineffectual SingleDel on key %s", roachpb.Key(info.UserKey))
}
atomic.AddInt64(&p.singleDelIneffectualCount, 1)
case pebble.NondeterministicSingleDelete:
if p.singleDelLogEvery.ShouldLog() {
log.Infof(p.logCtx, "possible nondeterministic SingleDel on key %s", roachpb.Key(info.UserKey))
}
atomic.AddInt64(&p.singleDelInvariantViolationCount, 1)
}
},
}
}
// Env implements Engine.
func (p *Pebble) Env() *fs.Env { return p.cfg.env }
func (p *Pebble) String() string {
dir := p.cfg.env.Dir
if dir == "" {
dir = "<in-mem>"
}
attrs := p.cfg.attrs.String()
if attrs == "" {
attrs = "<no-attributes>"
}
return fmt.Sprintf("%s=%s", attrs, dir)
}
// Close implements the Engine interface.
func (p *Pebble) Close() {
if p.closed {
p.logger.Infof("closing unopened pebble instance")
return
}
for _, closeFunc := range p.cfg.beforeClose {
closeFunc(p)
}
p.closed = true
// Wait for any asynchronous goroutines to exit.
p.asyncDone.Wait()
handleErr := func(err error) {
if err == nil {
return
}
// Allow unclean close in production builds for now. We refrain from
// Fatal-ing on an unclean close because Cockroach opens and closes
// ephemeral engines at times, and an error in those codepaths should not
// fatal the process.
//
// TODO(jackson): Propagate the error to call sites without fataling:
// This is tricky, because the Reader interface requires Close return
// nothing.
if buildutil.CrdbTestBuild {
log.Fatalf(p.logCtx, "error during engine close: %s\n", err)
} else {
log.Errorf(p.logCtx, "error during engine close: %s\n", err)
}
}
handleErr(p.db.Close())
if p.cfg.env != nil {
p.cfg.env.Close()
p.cfg.env = nil
}
if p.cfg.diskMonitor != nil {
p.cfg.diskMonitor.Close()
p.cfg.diskMonitor = nil
}
for _, closeFunc := range p.cfg.afterClose {
closeFunc()
}
}
// aggregateIterStats is propagated to all of an engine's iterators, aggregating
// iterator stats when an iterator is closed or its stats are reset. These
// aggregated stats are exposed through GetMetrics.
func (p *Pebble) aggregateIterStats(stats IteratorStats) {
p.iterStats.Lock()
defer p.iterStats.Unlock()
p.iterStats.BlockBytes += stats.Stats.InternalStats.BlockBytes
p.iterStats.BlockBytesInCache += stats.Stats.InternalStats.BlockBytesInCache
p.iterStats.BlockReadDuration += stats.Stats.InternalStats.BlockReadDuration
p.iterStats.ExternalSeeks += stats.Stats.ForwardSeekCount[pebble.InterfaceCall] + stats.Stats.ReverseSeekCount[pebble.InterfaceCall]
p.iterStats.ExternalSteps += stats.Stats.ForwardStepCount[pebble.InterfaceCall] + stats.Stats.ReverseStepCount[pebble.InterfaceCall]
p.iterStats.InternalSeeks += stats.Stats.ForwardSeekCount[pebble.InternalIterCall] + stats.Stats.ReverseSeekCount[pebble.InternalIterCall]
p.iterStats.InternalSteps += stats.Stats.ForwardStepCount[pebble.InternalIterCall] + stats.Stats.ReverseStepCount[pebble.InternalIterCall]
}
func (p *Pebble) aggregateBatchCommitStats(stats BatchCommitStats) {
p.batchCommitStats.Lock()
p.batchCommitStats.Count++
p.batchCommitStats.TotalDuration += stats.TotalDuration
p.batchCommitStats.SemaphoreWaitDuration += stats.SemaphoreWaitDuration
p.batchCommitStats.WALQueueWaitDuration += stats.WALQueueWaitDuration
p.batchCommitStats.MemTableWriteStallDuration += stats.MemTableWriteStallDuration
p.batchCommitStats.L0ReadAmpWriteStallDuration += stats.L0ReadAmpWriteStallDuration
p.batchCommitStats.WALRotationDuration += stats.WALRotationDuration
p.batchCommitStats.CommitWaitDuration += stats.CommitWaitDuration
p.batchCommitStats.Unlock()
}
// Closed implements the Engine interface.
func (p *Pebble) Closed() bool {
return p.closed
}
// MVCCIterate implements the Engine interface.
func (p *Pebble) MVCCIterate(
ctx context.Context,
start, end roachpb.Key,
iterKind MVCCIterKind,
keyTypes IterKeyType,
readCategory fs.ReadCategory,
f func(MVCCKeyValue, MVCCRangeKeyStack) error,
) error {
if iterKind == MVCCKeyAndIntentsIterKind {
r := wrapReader(p)
// Doing defer r.Free() does not inline.
err := iterateOnReader(ctx, r, start, end, iterKind, keyTypes, readCategory, f)
r.Free()
return err
}
return iterateOnReader(ctx, p, start, end, iterKind, keyTypes, readCategory, f)
}
// NewMVCCIterator implements the Engine interface.
func (p *Pebble) NewMVCCIterator(
ctx context.Context, iterKind MVCCIterKind, opts IterOptions,
) (MVCCIterator, error) {
if iterKind == MVCCKeyAndIntentsIterKind {
r := wrapReader(p)
// Doing defer r.Free() does not inline.
iter, err := r.NewMVCCIterator(ctx, iterKind, opts)
r.Free()
if err != nil {
return nil, err
}
return maybeWrapInUnsafeIter(iter), nil
}
iter, err := newPebbleIterator(ctx, p.db, opts, StandardDurability, p)
if err != nil {
return nil, err
}
return maybeWrapInUnsafeIter(iter), nil
}
// NewEngineIterator implements the Engine interface.
func (p *Pebble) NewEngineIterator(ctx context.Context, opts IterOptions) (EngineIterator, error) {
return newPebbleIterator(ctx, p.db, opts, StandardDurability, p)
}
// ScanInternal implements the Engine interface.
func (p *Pebble) ScanInternal(
ctx context.Context,
lower, upper roachpb.Key,
visitPointKey func(key *pebble.InternalKey, value pebble.LazyValue, info pebble.IteratorLevel) error,
visitRangeDel func(start []byte, end []byte, seqNum pebble.SeqNum) error,
visitRangeKey func(start []byte, end []byte, keys []rangekey.Key) error,
visitSharedFile func(sst *pebble.SharedSSTMeta) error,
visitExternalFile func(sst *pebble.ExternalFile) error,
) error {
rawLower := EngineKey{Key: lower}.Encode()
rawUpper := EngineKey{Key: upper}.Encode()
// TODO(sumeer): set category.
return p.db.ScanInternal(ctx, block.CategoryUnknown, rawLower, rawUpper, visitPointKey,
visitRangeDel, visitRangeKey, visitSharedFile, visitExternalFile)
}
// ConsistentIterators implements the Engine interface.
func (p *Pebble) ConsistentIterators() bool {
return false
}
// PinEngineStateForIterators implements the Engine interface.
func (p *Pebble) PinEngineStateForIterators(fs.ReadCategory) error {
return errors.AssertionFailedf(
"PinEngineStateForIterators must not be called when ConsistentIterators returns false")
}
// ApplyBatchRepr implements the Engine interface.
func (p *Pebble) ApplyBatchRepr(repr []byte, sync bool) error {
// batch.SetRepr takes ownership of the underlying slice, so make a copy.
reprCopy := make([]byte, len(repr))
copy(reprCopy, repr)
batch := p.db.NewBatch()
if err := batch.SetRepr(reprCopy); err != nil {
return err
}
opts := pebble.NoSync
if sync {
opts = pebble.Sync
}
return batch.Commit(opts)
}
// ClearMVCC implements the Engine interface.
func (p *Pebble) ClearMVCC(key MVCCKey, opts ClearOptions) error {
if key.Timestamp.IsEmpty() {
panic("ClearMVCC timestamp is empty")
}
return p.clear(key, opts)
}
// ClearUnversioned implements the Engine interface.
func (p *Pebble) ClearUnversioned(key roachpb.Key, opts ClearOptions) error {
return p.clear(MVCCKey{Key: key}, opts)
}
// ClearEngineKey implements the Engine interface.
func (p *Pebble) ClearEngineKey(key EngineKey, opts ClearOptions) error {
if len(key.Key) == 0 {
return emptyKeyError()
}
if !opts.ValueSizeKnown {
return p.db.Delete(key.Encode(), pebble.Sync)
}
return p.db.DeleteSized(key.Encode(), opts.ValueSize, pebble.Sync)
}
func (p *Pebble) clear(key MVCCKey, opts ClearOptions) error {
if len(key.Key) == 0 {
return emptyKeyError()
}
if !opts.ValueSizeKnown {
return p.db.Delete(EncodeMVCCKey(key), pebble.Sync)
}
// Use DeleteSized to propagate the value size.
return p.db.DeleteSized(EncodeMVCCKey(key), opts.ValueSize, pebble.Sync)
}
// SingleClearEngineKey implements the Engine interface.
func (p *Pebble) SingleClearEngineKey(key EngineKey) error {
if len(key.Key) == 0 {
return emptyKeyError()
}
return p.db.SingleDelete(key.Encode(), pebble.Sync)
}
// ClearRawRange implements the Engine interface.
func (p *Pebble) ClearRawRange(start, end roachpb.Key, pointKeys, rangeKeys bool) error {
startRaw, endRaw := EngineKey{Key: start}.Encode(), EngineKey{Key: end}.Encode()
if pointKeys {
if err := p.db.DeleteRange(startRaw, endRaw, pebble.Sync); err != nil {
return err
}
}
if rangeKeys {
if err := p.db.RangeKeyDelete(startRaw, endRaw, pebble.Sync); err != nil {
return err
}
}
return nil
}
// ClearMVCCRange implements the Engine interface.
func (p *Pebble) ClearMVCCRange(start, end roachpb.Key, pointKeys, rangeKeys bool) error {
// Write all the tombstones in one batch.
batch := p.NewUnindexedBatch()
defer batch.Close()
if err := batch.ClearMVCCRange(start, end, pointKeys, rangeKeys); err != nil {
return err
}
return batch.Commit(true)
}
// ClearMVCCVersions implements the Engine interface.
func (p *Pebble) ClearMVCCVersions(start, end MVCCKey) error {
return p.db.DeleteRange(EncodeMVCCKey(start), EncodeMVCCKey(end), pebble.Sync)
}
// ClearMVCCIteratorRange implements the Engine interface.
func (p *Pebble) ClearMVCCIteratorRange(start, end roachpb.Key, pointKeys, rangeKeys bool) error {
// Write all the tombstones in one batch.
batch := p.NewUnindexedBatch()
defer batch.Close()
if err := batch.ClearMVCCIteratorRange(start, end, pointKeys, rangeKeys); err != nil {
return err
}
return batch.Commit(true)
}
// ClearMVCCRangeKey implements the Engine interface.
func (p *Pebble) ClearMVCCRangeKey(rangeKey MVCCRangeKey) error {
if err := rangeKey.Validate(); err != nil {
return err
}
// If the range key holds an encoded timestamp as it was read from storage,
// write the tombstone to clear it using the same encoding of the timestamp.
// See #129592.
if len(rangeKey.EncodedTimestampSuffix) > 0 {
return p.ClearEngineRangeKey(
rangeKey.StartKey, rangeKey.EndKey, rangeKey.EncodedTimestampSuffix)
}
return p.ClearEngineRangeKey(
rangeKey.StartKey, rangeKey.EndKey, EncodeMVCCTimestampSuffix(rangeKey.Timestamp))
}
// PutMVCCRangeKey implements the Engine interface.
func (p *Pebble) PutMVCCRangeKey(rangeKey MVCCRangeKey, value MVCCValue) error {
// NB: all MVCC APIs currently assume all range keys are range tombstones.
if !value.IsTombstone() {
return errors.New("range keys can only be MVCC range tombstones")
}
valueRaw, err := EncodeMVCCValue(value)
if err != nil {
return errors.Wrapf(err, "failed to encode MVCC value for range key %s", rangeKey)
}
return p.PutRawMVCCRangeKey(rangeKey, valueRaw)
}
// PutRawMVCCRangeKey implements the Engine interface.
func (p *Pebble) PutRawMVCCRangeKey(rangeKey MVCCRangeKey, value []byte) error {
if err := rangeKey.Validate(); err != nil {
return err
}
return p.PutEngineRangeKey(
rangeKey.StartKey, rangeKey.EndKey, EncodeMVCCTimestampSuffix(rangeKey.Timestamp), value)
}
// Merge implements the Engine interface.
func (p *Pebble) Merge(key MVCCKey, value []byte) error {
if len(key.Key) == 0 {
return emptyKeyError()
}
return p.db.Merge(EncodeMVCCKey(key), value, pebble.Sync)
}
// PutMVCC implements the Engine interface.
func (p *Pebble) PutMVCC(key MVCCKey, value MVCCValue) error {
if key.Timestamp.IsEmpty() {
panic("PutMVCC timestamp is empty")
}
encValue, err := EncodeMVCCValue(value)
if err != nil {
return err
}
return p.put(key, encValue)
}
// PutRawMVCC implements the Engine interface.
func (p *Pebble) PutRawMVCC(key MVCCKey, value []byte) error {
if key.Timestamp.IsEmpty() {
panic("PutRawMVCC timestamp is empty")
}
return p.put(key, value)
}
// PutUnversioned implements the Engine interface.
func (p *Pebble) PutUnversioned(key roachpb.Key, value []byte) error {
return p.put(MVCCKey{Key: key}, value)
}
// PutEngineKey implements the Engine interface.
func (p *Pebble) PutEngineKey(key EngineKey, value []byte) error {
if len(key.Key) == 0 {
return emptyKeyError()
}
return p.db.Set(key.Encode(), value, pebble.Sync)
}
func (p *Pebble) put(key MVCCKey, value []byte) error {
if len(key.Key) == 0 {
return emptyKeyError()
}
return p.db.Set(EncodeMVCCKey(key), value, pebble.Sync)
}
// PutEngineRangeKey implements the Engine interface.
func (p *Pebble) PutEngineRangeKey(start, end roachpb.Key, suffix, value []byte) error {
return p.db.RangeKeySet(
EngineKey{Key: start}.Encode(), EngineKey{Key: end}.Encode(), suffix, value, pebble.Sync)
}
// ClearEngineRangeKey implements the Engine interface.
func (p *Pebble) ClearEngineRangeKey(start, end roachpb.Key, suffix []byte) error {
return p.db.RangeKeyUnset(
EngineKey{Key: start}.Encode(), EngineKey{Key: end}.Encode(), suffix, pebble.Sync)
}
// LogData implements the Engine interface.
func (p *Pebble) LogData(data []byte) error {
return p.db.LogData(data, pebble.Sync)
}
// LogLogicalOp implements the Engine interface.
func (p *Pebble) LogLogicalOp(op MVCCLogicalOpType, details MVCCLogicalOpDetails) {
// No-op. Logical logging disabled.
}
// LocalTimestampsEnabled controls whether local timestamps are written in MVCC
// values. A true setting is also gated on clusterversion.LocalTimestamps. After
// all nodes in a cluster are at or beyond clusterversion.LocalTimestamps,
// different nodes will see the version state transition at different times.
// Nodes that have not yet seen the transition may remove the local timestamp
// from an intent that has one during intent resolution. This will not cause
// problems.
//
// TODO(nvanbenschoten): remove this cluster setting and its associated plumbing
// when removing the cluster version, once we're confident in the efficacy and
// stability of local timestamps.
var LocalTimestampsEnabled = settings.RegisterBoolSetting(
settings.SystemOnly,
"storage.transaction.local_timestamps.enabled",
"if enabled, MVCC keys will be written with local timestamps",
true,
)
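// As an illustration only (not taken from this file), tests that need to pin
// this behavior typically override the setting directly; the Override
// signature sketched below is assumed, see pkg/settings for the authoritative
// API, and ctx/st are assumed to be in scope:
//
//	st := cluster.MakeTestingClusterSettings()
//	LocalTimestampsEnabled.Override(ctx, &st.SV, false)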
func shouldWriteLocalTimestamps(ctx context.Context, settings *cluster.Settings) bool {
return LocalTimestampsEnabled.Get(&settings.SV)
}
// ShouldWriteLocalTimestamps implements the Writer interface.
func (p *Pebble) ShouldWriteLocalTimestamps(ctx context.Context) bool {
// This is not fast. Pebble should not be used by writers that want
// performance. They should use pebbleBatch.
return shouldWriteLocalTimestamps(ctx, p.cfg.settings)
}
// Attrs implements the Engine interface.
func (p *Pebble) Attrs() roachpb.Attributes {
return p.cfg.attrs
}
// Properties implements the Engine interface.
func (p *Pebble) Properties() roachpb.StoreProperties {
return p.properties
}
// Capacity implements the Engine interface.
func (p *Pebble) Capacity() (roachpb.StoreCapacity, error) {
dir := p.cfg.env.Dir
if dir != "" {
var err error
// Evaluate the directory if it is a symbolic link.
if dir, err = filepath.EvalSymlinks(dir); err != nil {
return roachpb.StoreCapacity{}, err
}
}
du, err := p.cfg.env.UnencryptedFS.GetDiskUsage(dir)
if errors.Is(err, vfs.ErrUnsupported) {
// This is an in-memory instance. Pretend we're empty since we
// don't know better and only use this for testing. Using any
// part of the actual file system here can throw off allocator
// rebalancing in a hard-to-trace manner. See #7050.
return roachpb.StoreCapacity{
Capacity: p.cfg.maxSize.bytes,
Available: p.cfg.maxSize.bytes,
}, nil
} else if err != nil {
return roachpb.StoreCapacity{}, err
}
if du.TotalBytes > math.MaxInt64 {
return roachpb.StoreCapacity{}, fmt.Errorf("unsupported disk size %s, max supported size is %s",
humanize.IBytes(du.TotalBytes), humanizeutil.IBytes(math.MaxInt64))
}
if du.AvailBytes > math.MaxInt64 {
return roachpb.StoreCapacity{}, fmt.Errorf("unsupported disk size %s, max supported size is %s",
humanize.IBytes(du.AvailBytes), humanizeutil.IBytes(math.MaxInt64))
}
fsuTotal := int64(du.TotalBytes)
fsuAvail := int64(du.AvailBytes)
// If the emergency ballast isn't appropriately sized, try to resize it.
// This is a no-op if the ballast is already sized or if there's not
// enough available capacity to resize it. Capacity is called periodically
// by the kvserver, and that drives the automatic resizing of the ballast.
if !p.cfg.env.IsReadOnly() {
resized, err := maybeEstablishBallast(p.cfg.env.UnencryptedFS, p.ballastPath, p.cfg.ballastSize, du)
if err != nil {
return roachpb.StoreCapacity{}, errors.Wrap(err, "resizing ballast")
}
if resized {
p.logger.Infof("resized ballast %s to size %s",
p.ballastPath, humanizeutil.IBytes(p.cfg.ballastSize))
du, err = p.cfg.env.UnencryptedFS.GetDiskUsage(dir)
if err != nil {
return roachpb.StoreCapacity{}, err
}
}
}
// Pebble has detailed accounting of its own disk space usage, and it's
// incrementally updated, which helps avoid O(# files) work here.
m := p.db.Metrics()
totalUsedBytes := int64(m.DiskSpaceUsage())
// We don't have incremental accounting of the disk space usage of files
// in the auxiliary directory. Walk the auxiliary directory and all its
// subdirectories, adding to the total used bytes.
if errOuter := filepath.Walk(p.auxDir, func(path string, info os.FileInfo, err error) error {
if err != nil {
// This can happen if CockroachDB removes files out from under us -
// just keep going to get the best estimate we can.
if oserror.IsNotExist(err) {
return nil
}
// Special-case: if the store-dir is configured using the root of some fs,
// e.g. "/mnt/db", we might have special fs-created files like lost+found
// that we can't read, so just ignore them rather than crashing.
if oserror.IsPermission(err) && filepath.Base(path) == "lost+found" {
return nil
}
return err
}
if path == p.ballastPath {
// Skip the ballast. Counting it as used is likely to confuse
// users, and it's more akin to space that is just unavailable,
// like disk space often restricted to the root user.
return nil
}
if info.Mode().IsRegular() {
totalUsedBytes += info.Size()
}
return nil
}); errOuter != nil {
return roachpb.StoreCapacity{}, errOuter
}
// If no size limitation has been placed on the store size, or if the
// limitation is greater than what's available, just return the actual
// totals.
if ((p.cfg.maxSize.bytes == 0 || p.cfg.maxSize.bytes >= fsuTotal) && p.cfg.maxSize.percent == 0) || p.cfg.env.Dir == "" {
return roachpb.StoreCapacity{
Capacity: fsuTotal,
Available: fsuAvail,
Used: totalUsedBytes,
}, nil
}
maxSize := p.cfg.maxSize.bytes
if p.cfg.maxSize.percent != 0 {
maxSize = int64(float64(fsuTotal) * p.cfg.maxSize.percent)
}
available := maxSize - totalUsedBytes
if available > fsuAvail {
available = fsuAvail
}
if available < 0 {
available = 0
}
return roachpb.StoreCapacity{
Capacity: maxSize,
Available: available,
Used: totalUsedBytes,
}, nil
}
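// To make the capacity arithmetic above concrete (illustrative numbers only):
// with maxSize.percent = 0.5 on a 100 GiB filesystem that has 70 GiB free and
// 10 GiB used by this store, the reported Capacity is 50 GiB and Available is
// min(50 GiB - 10 GiB, 70 GiB) = 40 GiB, clamped at zero if the store were
// already over its limit.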
// Flush implements the Engine interface.
func (p *Pebble) Flush() error {
return p.db.Flush()
}
// GetMetrics implements the Engine interface.
func (p *Pebble) GetMetrics() Metrics {
m := Metrics{
Metrics: p.db.Metrics(),
WriteStallCount: atomic.LoadInt64(&p.writeStallCount),
WriteStallDuration: time.Duration(atomic.LoadInt64((*int64)(&p.writeStallDuration))),
DiskSlowCount: atomic.LoadInt64(&p.diskSlowCount),
DiskStallCount: atomic.LoadInt64(&p.diskStallCount),
SingleDelInvariantViolationCount: atomic.LoadInt64(&p.singleDelInvariantViolationCount),
SingleDelIneffectualCount: atomic.LoadInt64(&p.singleDelIneffectualCount),
SharedStorageReadBytes: atomic.LoadInt64(&p.sharedBytesRead),
SharedStorageWriteBytes: atomic.LoadInt64(&p.sharedBytesWritten),
}
if sema := p.cfg.opts.LoadBlockSema; sema != nil {
semaStats := sema.Stats()
m.BlockLoadConcurrencyLimit = semaStats.Capacity
m.BlockLoadsInProgress = semaStats.Outstanding
m.BlockLoadsQueued = semaStats.NumHadToWait
}
p.iterStats.Lock()
m.Iterator = p.iterStats.AggregatedIteratorStats
p.iterStats.Unlock()
p.batchCommitStats.Lock()
m.BatchCommitStats = p.batchCommitStats.AggregatedBatchCommitStats
p.batchCommitStats.Unlock()
if p.diskWriteStatsCollector != nil {
m.DiskWriteStats = p.diskWriteStatsCollector.GetStats()
}
return m
}
// GetPebbleOptions implements the Engine interface.
func (p *Pebble) GetPebbleOptions() *pebble.Options {
return p.cfg.opts
}
// GetEncryptionRegistries implements the Engine interface.
func (p *Pebble) GetEncryptionRegistries() (*fs.EncryptionRegistries, error) {
rv := &fs.EncryptionRegistries{}
var err error
if p.cfg.env.Encryption != nil {
rv.KeyRegistry, err = p.cfg.env.Encryption.StatsHandler.GetDataKeysRegistry()
if err != nil {
return nil, err
}
}
if p.cfg.env.Registry != nil {
rv.FileRegistry, err = protoutil.Marshal(p.cfg.env.Registry.GetRegistrySnapshot())
if err != nil {
return nil, err
}
}
return rv, nil
}
// GetEnvStats implements the Engine interface.
func (p *Pebble) GetEnvStats() (*fs.EnvStats, error) {
// TODO(sumeer): make the stats complete. There are no bytes stats. TotalFiles is missing
// files that are not in the registry (from before encryption was enabled).
stats := &fs.EnvStats{}
if p.cfg.env.Encryption == nil {
return stats, nil
}
stats.EncryptionType = p.cfg.env.Encryption.StatsHandler.GetActiveStoreKeyType()
var err error
stats.EncryptionStatus, err = p.cfg.env.Encryption.StatsHandler.GetEncryptionStatus()
if err != nil {
return nil, err
}
fr := p.cfg.env.Registry.GetRegistrySnapshot()
activeKeyID, err := p.cfg.env.Encryption.StatsHandler.GetActiveDataKeyID()
if err != nil {
return nil, err
}
m := p.db.Metrics()
stats.TotalFiles = 3 /* CURRENT, MANIFEST, OPTIONS */
stats.TotalFiles += uint64(m.WAL.Files + m.Table.ZombieCount + m.WAL.ObsoleteFiles + m.Table.ObsoleteCount)
stats.TotalBytes = m.WAL.Size + m.Table.ZombieSize + m.Table.ObsoleteSize
for _, l := range m.Levels {
stats.TotalFiles += uint64(l.NumFiles)
stats.TotalBytes += uint64(l.Size)
}
sstSizes := make(map[pebble.FileNum]uint64)
sstInfos, err := p.db.SSTables()
if err != nil {
return nil, err
}
for _, ssts := range sstInfos {
for _, sst := range ssts {
sstSizes[sst.FileNum] = sst.Size
}
}
for filePath, entry := range fr.Files {
keyID, err := p.cfg.env.Encryption.StatsHandler.GetKeyIDFromSettings(entry.EncryptionSettings)
if err != nil {
return nil, err
}
if len(keyID) == 0 {
keyID = "plain"
}
if keyID != activeKeyID {
continue
}
stats.ActiveKeyFiles++
filename := p.cfg.env.PathBase(filePath)
numStr := strings.TrimSuffix(filename, ".sst")
if len(numStr) == len(filename) {
continue // not an sstable
}
u, err := strconv.ParseUint(numStr, 10, 64)
if err != nil {
return nil, errors.Wrapf(err, "parsing filename %q", errors.Safe(filename))
}
stats.ActiveKeyBytes += sstSizes[pebble.FileNum(u)]
}
// Ensure that encryption percentage does not exceed 100%.
frFileLen := uint64(len(fr.Files))
if stats.TotalFiles < frFileLen {
stats.TotalFiles = frFileLen
}
if stats.TotalBytes < stats.ActiveKeyBytes {
stats.TotalBytes = stats.ActiveKeyBytes
}
return stats, nil
}
// GetAuxiliaryDir implements the Engine interface.
func (p *Pebble) GetAuxiliaryDir() string {
return p.auxDir
}
// NewBatch implements the Engine interface.
func (p *Pebble) NewBatch() Batch {
return newPebbleBatch(p.db, p.db.NewIndexedBatch(), p.cfg.settings, p, p)
}
// NewReader implements the Engine interface.
func (p *Pebble) NewReader(durability DurabilityRequirement) Reader {
return newPebbleReadOnly(p, durability)
}
// NewReadOnly implements the Engine interface.
func (p *Pebble) NewReadOnly(durability DurabilityRequirement) ReadWriter {
return newPebbleReadOnly(p, durability)
}
// NewUnindexedBatch implements the Engine interface.
func (p *Pebble) NewUnindexedBatch() Batch {
return newPebbleBatch(p.db, p.db.NewBatch(), p.cfg.settings, p, p)
}
// NewWriteBatch implements the Engine interface.
func (p *Pebble) NewWriteBatch() WriteBatch {
return newWriteBatch(p.db, p.db.NewBatch(), p.cfg.settings, p, p)
}
// NewSnapshot implements the Engine interface.
func (p *Pebble) NewSnapshot() Reader {
return &pebbleSnapshot{
snapshot: p.db.NewSnapshot(),
parent: p,
}
}
// NewEventuallyFileOnlySnapshot implements the Engine interface.
func (p *Pebble) NewEventuallyFileOnlySnapshot(keyRanges []roachpb.Span) EventuallyFileOnlyReader {
engineKeyRanges := make([]pebble.KeyRange, len(keyRanges))
for i := range keyRanges {
engineKeyRanges[i].Start = EngineKey{Key: keyRanges[i].Key}.Encode()
engineKeyRanges[i].End = EngineKey{Key: keyRanges[i].EndKey}.Encode()
}
efos := p.db.NewEventuallyFileOnlySnapshot(engineKeyRanges)
return &pebbleEFOS{
efos: efos,
parent: p,
keyRanges: keyRanges,
}
}
// IngestLocalFiles implements the Engine interface.
func (p *Pebble) IngestLocalFiles(ctx context.Context, paths []string) error {
return p.db.Ingest(ctx, paths)
}
// IngestLocalFilesWithStats implements the Engine interface.
func (p *Pebble) IngestLocalFilesWithStats(
ctx context.Context, paths []string,
) (pebble.IngestOperationStats, error) {
return p.db.IngestWithStats(ctx, paths)
}
// IngestAndExciseFiles implements the Engine interface.
func (p *Pebble) IngestAndExciseFiles(
ctx context.Context,
paths []string,
shared []pebble.SharedSSTMeta,
external []pebble.ExternalFile,
exciseSpan roachpb.Span,
sstsContainExciseTombstone bool,
) (pebble.IngestOperationStats, error) {
rawSpan := pebble.KeyRange{
Start: EngineKey{Key: exciseSpan.Key}.Encode(),
End: EngineKey{Key: exciseSpan.EndKey}.Encode(),
}
return p.db.IngestAndExcise(ctx, paths, shared, external, rawSpan)
}
// IngestExternalFiles implements the Engine interface.
func (p *Pebble) IngestExternalFiles(
ctx context.Context, external []pebble.ExternalFile,
) (pebble.IngestOperationStats, error) {
return p.db.IngestExternalFiles(ctx, external)
}
// PreIngestDelay implements the Engine interface.
func (p *Pebble) PreIngestDelay(ctx context.Context) {
preIngestDelay(ctx, p, p.cfg.settings)
}
// GetTableMetrics implements the Engine interface.
func (p *Pebble) GetTableMetrics(start, end roachpb.Key) ([]enginepb.SSTableMetricsInfo, error) {
filterOpt := pebble.WithKeyRangeFilter(
EncodeMVCCKey(MVCCKey{Key: start}),
EncodeMVCCKey(MVCCKey{Key: end}),
)
tableInfo, err := p.db.SSTables(filterOpt, pebble.WithProperties(), pebble.WithApproximateSpanBytes())
if err != nil {
return []enginepb.SSTableMetricsInfo{}, err
}
var totalTables int
for _, info := range tableInfo {
totalTables += len(info)
}
var metricsInfo []enginepb.SSTableMetricsInfo
for level, sstableInfos := range tableInfo {
for _, sstableInfo := range sstableInfos {
marshalTableInfo, err := json.Marshal(sstableInfo)
if err != nil {
return []enginepb.SSTableMetricsInfo{}, err
}
metricsInfo = append(metricsInfo, enginepb.SSTableMetricsInfo{
Level: int32(level),
TableID: uint64(sstableInfo.TableInfo.FileNum),
TableInfoJSON: marshalTableInfo,
ApproximateSpanBytes: sstableInfo.ApproximateSpanBytes,
})
}
}
return metricsInfo, nil
}
// ScanStorageInternalKeys implements the Engine interface.
func (p *Pebble) ScanStorageInternalKeys(
start, end roachpb.Key, megabytesPerSecond int64,
) ([]enginepb.StorageInternalKeysMetrics, error) {
stats, err := p.db.ScanStatistics(context.TODO(), start, end, pebble.ScanStatisticsOptions{LimitBytesPerSecond: 1000000 * megabytesPerSecond})
if err != nil {
return []enginepb.StorageInternalKeysMetrics{}, err
}
setMetricsFromStats := func(
level int, stats *pebble.KeyStatistics, m *enginepb.StorageInternalKeysMetrics) {
*m = enginepb.StorageInternalKeysMetrics{
Level: int32(level),
SnapshotPinnedKeys: uint64(stats.SnapshotPinnedKeys),
SnapshotPinnedKeysBytes: stats.SnapshotPinnedKeysBytes,
PointKeyDeleteCount: uint64(stats.KindsCount[pebble.InternalKeyKindDelete]),
PointKeySetCount: uint64(stats.KindsCount[pebble.InternalKeyKindSet]),
RangeDeleteCount: uint64(stats.KindsCount[pebble.InternalKeyKindRangeDelete]),
RangeKeySetCount: uint64(stats.KindsCount[pebble.InternalKeyKindRangeKeySet]),
RangeKeyDeleteCount: uint64(stats.KindsCount[pebble.InternalKeyKindRangeKeyDelete]),
PointKeyDeleteIsLatestCount: uint64(stats.LatestKindsCount[pebble.InternalKeyKindDelete]),
PointKeySetIsLatestCount: uint64(stats.LatestKindsCount[pebble.InternalKeyKindSet]),
}
}
var metrics []enginepb.StorageInternalKeysMetrics
for level := 0; level < 7; level++ {
var m enginepb.StorageInternalKeysMetrics
setMetricsFromStats(level, &stats.Levels[level], &m)
metrics = append(metrics, m)
}
var m enginepb.StorageInternalKeysMetrics
setMetricsFromStats(-1 /* level */, &stats.Accumulated, &m)
metrics = append(metrics, m)
return metrics, nil
}
// ApproximateDiskBytes implements the Engine interface.
func (p *Pebble) ApproximateDiskBytes(
from, to roachpb.Key,
) (bytes, remoteBytes, externalBytes uint64, _ error) {
fromEncoded := EngineKey{Key: from}.Encode()
toEncoded := EngineKey{Key: to}.Encode()
bytes, remoteBytes, externalBytes, err := p.db.EstimateDiskUsageByBackingType(fromEncoded, toEncoded)
if err != nil {
return 0, 0, 0, err
}
return bytes, remoteBytes, externalBytes, nil
}
// Compact implements the Engine interface.
func (p *Pebble) Compact() error {
return p.db.Compact(nil, EncodeMVCCKey(MVCCKeyMax), true /* parallel */)
}
// CompactRange implements the Engine interface.
func (p *Pebble) CompactRange(start, end roachpb.Key) error {
// TODO(jackson): Consider changing Engine.CompactRange's signature to take
// in EngineKeys so that it's unambiguous that the arguments have already
// been encoded as engine keys. We do need to encode these keys in protocol
// buffers when they're sent over the wire during the
// crdb_internal.compact_engine_span builtin. Maybe we should have a
// roachpb.Key equivalent for EngineKey so we don't lose that type
// information?
if ek, ok := DecodeEngineKey(start); !ok || ek.Validate() != nil {
return errors.Errorf("invalid start key: %q", start)
}
if ek, ok := DecodeEngineKey(end); !ok || ek.Validate() != nil {
return errors.Errorf("invalid end key: %q", end)
}
return p.db.Compact(start, end, true /* parallel */)
}
// RegisterFlushCompletedCallback implements the Engine interface.
func (p *Pebble) RegisterFlushCompletedCallback(cb func()) {
p.mu.Lock()
p.mu.flushCompletedCallback = cb
p.mu.Unlock()
}
func checkpointSpansNote(spans []roachpb.Span) []byte {
note := "CRDB spans:\n"
for _, span := range spans {
note += span.String() + "\n"
}
return []byte(note)
}
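// For example, a checkpoint restricted to a single span produces a note of the
// form below, where each line after the header is the output of
// roachpb.Span.String (the exact rendering shown here is only illustrative):
//
//	CRDB spans:
//	/Table/106{-/2}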
// CreateCheckpoint implements the Engine interface.
func (p *Pebble) CreateCheckpoint(dir string, spans []roachpb.Span) error {
opts := []pebble.CheckpointOption{
pebble.WithFlushedWAL(),
}
if l := len(spans); l > 0 {
s := make([]pebble.CheckpointSpan, 0, l)
for _, span := range spans {
s = append(s, pebble.CheckpointSpan{
Start: EngineKey{Key: span.Key}.Encode(),
End: EngineKey{Key: span.EndKey}.Encode(),
})
}
opts = append(opts, pebble.WithRestrictToSpans(s))
}
if err := p.db.Checkpoint(dir, opts...); err != nil {
return err
}
// Write out the min version file.
if err := writeMinVersionFile(p.cfg.env.UnencryptedFS, dir, p.MinVersion()); err != nil {
return errors.Wrapf(err, "writing min version file for checkpoint")
}
// TODO(#90543, cockroachdb/pebble#2285): move spans info to Pebble manifest.
if len(spans) > 0 {
if err := fs.SafeWriteToFile(
p.cfg.env, dir, p.cfg.env.PathJoin(dir, "checkpoint.txt"),
checkpointSpansNote(spans),
fs.UnspecifiedWriteCategory,
); err != nil {
return err
}
}
return nil
}
// pebbleFormatVersionMap maps cluster versions to the corresponding pebble
// format version. For a given cluster version, the entry with the latest
// version that is not newer than the given version is chosen.
//
// This map needs to include the "final" version for each supported previous
// release, and the in-development versions of the current release and the
// previous one (for experimental version skipping during upgrade).
//
// Pebble has a concept of format major versions, similar to cluster versions.
// Backwards incompatible changes to Pebble's on-disk format are gated behind
// new format major versions. Bumping the storage engine's format major version
// is tied to a CockroachDB cluster version.
//
// Format major versions and cluster versions both only ratchet upwards. Here we
// map the persisted cluster version to the corresponding format major version,
// ratcheting Pebble's format major version if necessary.
//
// The pebble versions are advanced when nodes enter the "fence" version for the
// named cluster version, if there is one, so that if *any* node moves into the
// named version, it can be assumed that *all* nodes have ratcheted to the pebble
// version associated with it, since they did so during the fence version.
var pebbleFormatVersionMap = map[clusterversion.Key]pebble.FormatMajorVersion{
clusterversion.V24_3: pebble.FormatColumnarBlocks,
}
// pebbleFormatVersionKeys contains the keys in the map above, in descending order.
var pebbleFormatVersionKeys []clusterversion.Key = func() []clusterversion.Key {
versionKeys := make([]clusterversion.Key, 0, len(pebbleFormatVersionMap))
for k := range pebbleFormatVersionMap {
versionKeys = append(versionKeys, k)
}
// Sort the keys in reverse order.
sort.Slice(versionKeys, func(i, j int) bool {
return versionKeys[i] > versionKeys[j]
})
return versionKeys
}()
// pebbleFormatVersion finds the most recent pebble format version supported by
// the given cluster version.
func pebbleFormatVersion(clusterVersion roachpb.Version) pebble.FormatMajorVersion {
// pebbleFormatVersionKeys are sorted in descending order; find the first one
// that is not newer than clusterVersion.
for _, k := range pebbleFormatVersionKeys {
if clusterVersion.AtLeast(k.Version().FenceVersion()) {
return pebbleFormatVersionMap[k]
}
}
// This should never happen in production. But we tolerate tests creating
// imaginary older versions; we must still use the earliest supported
// format.
return MinimumSupportedFormatVersion
}
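// For example (a sketch of the lookup above, assuming the map contents shown
// earlier in this file):
//
//	fv := pebbleFormatVersion(clusterversion.V24_3.Version())
//	// fv == pebble.FormatColumnarBlocks, since V24_3 is at or beyond its own
//	// fence version.
//
// Any cluster version older than every key in pebbleFormatVersionMap falls
// back to MinimumSupportedFormatVersion.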
// SetMinVersion implements the Engine interface.
func (p *Pebble) SetMinVersion(version roachpb.Version) error {
p.minVersion = version
if p.cfg.env.IsReadOnly() {
// Don't make any on-disk changes.
return nil
}
// NB: SetMinVersion must be idempotent. It may be called multiple
// times with the same version.
// Writing the min version file commits this storage engine to the
// provided cluster version.
if err := writeMinVersionFile(p.cfg.env.UnencryptedFS, p.cfg.env.Dir, version); err != nil {
return err
}
// Set the shared object creator ID.
if storeID := p.storeIDPebbleLog.Get(); storeID != 0 && storeID != base.TempStoreID {
if err := p.db.SetCreatorID(uint64(storeID)); err != nil {
return err
}
}
formatVers := pebbleFormatVersion(version)
if p.db.FormatMajorVersion() < formatVers {
if err := p.db.RatchetFormatMajorVersion(formatVers); err != nil {
return errors.Wrap(err, "ratcheting format major version")
}
}
return nil
}
// MinVersion implements the Engine interface.
func (p *Pebble) MinVersion() roachpb.Version {
return p.minVersion
}
// BufferedSize implements the Engine interface.
func (p *Pebble) BufferedSize() int {
return 0
}
// ConvertFilesToBatchAndCommit implements the Engine interface.
func (p *Pebble) ConvertFilesToBatchAndCommit(
_ context.Context, paths []string, clearedSpans []roachpb.Span,
) error {
files := make([]sstable.ReadableFile, len(paths))
closeFiles := func() {
for i := range files {
if files[i] != nil {
files[i].Close()
}
}
}
for i, fileName := range paths {
f, err := p.cfg.env.Open(fileName)
if err != nil {
closeFiles()
return err
}
files[i] = f
}
iter, err := NewSSTEngineIterator(
[][]sstable.ReadableFile{files},
IterOptions{
KeyTypes: IterKeyTypePointsAndRanges,
LowerBound: roachpb.KeyMin,
UpperBound: roachpb.KeyMax,
})
if err != nil {
// TODO(sumeer): we don't call closeFiles() since in the error case some
// of the files may be closed. See the code in
// https://github.com/cockroachdb/pebble/blob/master/external_iterator.go#L104-L113
// which closes the opened readers. At this point in the code we don't
// know which files are already closed. The callee needs to be fixed to
// not close any of the files or close all the files in the error case.
// The natural behavior would be to not close any file. Fix this in
// Pebble, and then adjust the code here if needed.
return err
}
defer iter.Close()
batch := p.NewWriteBatch()
for i := range clearedSpans {
err :=
batch.ClearRawRange(clearedSpans[i].Key, clearedSpans[i].EndKey, true, true)
if err != nil {
return err
}
}
valid, err := iter.SeekEngineKeyGE(EngineKey{Key: roachpb.KeyMin})
for valid {
hasPoint, hasRange := iter.HasPointAndRange()
if hasPoint {
var k EngineKey
if k, err = iter.UnsafeEngineKey(); err != nil {
break
}
var v []byte
if v, err = iter.UnsafeValue(); err != nil {
break
}
if err = batch.PutEngineKey(k, v); err != nil {
break
}
}
if hasRange && iter.RangeKeyChanged() {
var rangeBounds roachpb.Span
if rangeBounds, err = iter.EngineRangeBounds(); err != nil {
break
}
rangeKeys := iter.EngineRangeKeys()
for i := range rangeKeys {
if err = batch.PutEngineRangeKey(rangeBounds.Key, rangeBounds.EndKey, rangeKeys[i].Version,
rangeKeys[i].Value); err != nil {
break
}
}
if err != nil {
break
}
}
valid, err = iter.NextEngineKey()
}
if err != nil {
batch.Close()
return err
}
return batch.Commit(true)
}
type pebbleReadOnly struct {
parent *Pebble
// The iterator reuse optimization in pebbleReadOnly is for servicing a
// BatchRequest, such that the iterators get reused across different
// requests in the batch.
// Reuse iterators for {normal,prefix} x {MVCCKey,EngineKey} iteration. We
// need separate iterators for EngineKey and MVCCKey iteration since
// iterators that make separated locks/intents look as interleaved need to
// use both simultaneously.
// When the first iterator is initialized, or when
// PinEngineStateForIterators is called (whichever happens first), the
// underlying *pebble.Iterator is stashed in iter, so that subsequent
// iterator initialization can use Iterator.Clone to use the same underlying
// engine state. This relies on the fact that all pebbleIterators created
// here are marked as reusable, which causes pebbleIterator.Close to not
// close iter. iter will be closed when pebbleReadOnly.Close is called.
prefixIter pebbleIterator
normalIter pebbleIterator
prefixEngineIter pebbleIterator
normalEngineIter pebbleIterator
iter pebbleiter.Iterator
iterUsed bool // avoids cloning after PinEngineStateForIterators()
durability DurabilityRequirement
closed bool
}
var _ ReadWriter = &pebbleReadOnly{}
var pebbleReadOnlyPool = sync.Pool{
New: func() interface{} {
return &pebbleReadOnly{
// Defensively set reusable=true. One has to be careful about this since
// an accidental false value would cause these iterators, that are value
// members of pebbleReadOnly, to be put in the pebbleIterPool.
prefixIter: pebbleIterator{reusable: true},
normalIter: pebbleIterator{reusable: true},
prefixEngineIter: pebbleIterator{reusable: true},
normalEngineIter: pebbleIterator{reusable: true},
}
},
}
// Instantiates a new pebbleReadOnly.
func newPebbleReadOnly(parent *Pebble, durability DurabilityRequirement) *pebbleReadOnly {
p := pebbleReadOnlyPool.Get().(*pebbleReadOnly)
// When p is a reused pebbleReadOnly from the pool, the iter fields preserve
// the original reusable=true that was set above in pebbleReadOnlyPool.New(),
// and some buffers that are safe to reuse. Everything else has been reset by
// pebbleIterator.destroy().
*p = pebbleReadOnly{
parent: parent,
prefixIter: p.prefixIter,
normalIter: p.normalIter,
prefixEngineIter: p.prefixEngineIter,
normalEngineIter: p.normalEngineIter,
durability: durability,
}
return p
}
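// Typical lifecycle (a sketch; callers normally obtain a pebbleReadOnly via
// Engine.NewReadOnly rather than calling this constructor directly, and eng
// below stands in for any Engine):
//
//	r := eng.NewReadOnly(StandardDurability)
//	defer r.Close()
//	it, err := r.NewMVCCIterator(ctx, MVCCKeyIterKind, opts)
//	// Later iterators on r clone the pinned engine state instead of acquiring
//	// fresh state, which is why ConsistentIterators returns true.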
func (p *pebbleReadOnly) Close() {
if p.closed {
panic("closing an already-closed pebbleReadOnly")
}
p.closed = true
if p.iter != nil && !p.iterUsed {
err := p.iter.Close()
if err != nil {
panic(err)
}
}
// Setting iter to nil is sufficient since it will be closed by one of the
// subsequent destroy calls.
p.iter = nil
p.prefixIter.destroy()
p.normalIter.destroy()
p.prefixEngineIter.destroy()
p.normalEngineIter.destroy()
p.durability = StandardDurability
pebbleReadOnlyPool.Put(p)
}
func (p *pebbleReadOnly) Closed() bool {
return p.closed
}
func (p *pebbleReadOnly) MVCCIterate(
ctx context.Context,
start, end roachpb.Key,
iterKind MVCCIterKind,
keyTypes IterKeyType,
readCategory fs.ReadCategory,
f func(MVCCKeyValue, MVCCRangeKeyStack) error,
) error {
if p.closed {
panic("using a closed pebbleReadOnly")
}
if iterKind == MVCCKeyAndIntentsIterKind {
r := wrapReader(p)
// Doing defer r.Free() does not inline.
err := iterateOnReader(ctx, r, start, end, iterKind, keyTypes, readCategory, f)
r.Free()
return err
}
return iterateOnReader(ctx, p, start, end, iterKind, keyTypes, readCategory, f)
}
// NewMVCCIterator implements the Engine interface.
func (p *pebbleReadOnly) NewMVCCIterator(
ctx context.Context, iterKind MVCCIterKind, opts IterOptions,
) (MVCCIterator, error) {
if p.closed {
panic("using a closed pebbleReadOnly")
}
if iterKind == MVCCKeyAndIntentsIterKind {
r := wrapReader(p)
// Doing defer r.Free() does not inline.
iter, err := r.NewMVCCIterator(ctx, iterKind, opts)
r.Free()
if err != nil {
return nil, err
}
return maybeWrapInUnsafeIter(iter), nil
}
iter := &p.normalIter
if opts.Prefix {
iter = &p.prefixIter
}
if iter.inuse {
return newPebbleIteratorByCloning(ctx, CloneContext{
rawIter: p.iter,
engine: p.parent,
}, opts, p.durability), nil
}
if iter.iter != nil {
iter.setOptions(ctx, opts, p.durability)
} else {
if err := iter.initReuseOrCreate(
ctx, p.parent.db, p.iter, p.iterUsed, opts, p.durability, p.parent); err != nil {
return nil, err
}
if p.iter == nil {
// For future cloning.
p.iter = iter.iter
}
p.iterUsed = true
iter.reusable = true
}
iter.inuse = true
return maybeWrapInUnsafeIter(iter), nil
}
// NewEngineIterator implements the Engine interface.
func (p *pebbleReadOnly) NewEngineIterator(
ctx context.Context, opts IterOptions,
) (EngineIterator, error) {
if p.closed {
panic("using a closed pebbleReadOnly")
}
iter := &p.normalEngineIter
if opts.Prefix {
iter = &p.prefixEngineIter
}
if iter.inuse {
return newPebbleIteratorByCloning(ctx, CloneContext{
rawIter: p.iter,
engine: p.parent,
}, opts, p.durability), nil
}
if iter.iter != nil {
iter.setOptions(ctx, opts, p.durability)
} else {
err := iter.initReuseOrCreate(
ctx, p.parent.db, p.iter, p.iterUsed, opts, p.durability, p.parent)
if err != nil {
return nil, err
}
if p.iter == nil {
// For future cloning.
p.iter = iter.iter
}
p.iterUsed = true
iter.reusable = true
}
iter.inuse = true
return iter, nil
}
// ConsistentIterators implements the Engine interface.
func (p *pebbleReadOnly) ConsistentIterators() bool {
return true
}
// PinEngineStateForIterators implements the Engine interface.
func (p *pebbleReadOnly) PinEngineStateForIterators(readCategory fs.ReadCategory) error {
if p.iter == nil {
o := &pebble.IterOptions{Category: readCategory.PebbleCategory()}
if p.durability == GuaranteedDurability {
o.OnlyReadGuaranteedDurable = true
}
iter, err := p.parent.db.NewIter(o)
if err != nil {
return err
}
p.iter = pebbleiter.MaybeWrap(iter)
// NB: p.iterUsed == false avoids cloning this in NewMVCCIterator(), since
// we've just created it.
}
return nil
}
// ScanInternal implements the Reader interface.
func (p *pebbleReadOnly) ScanInternal(
ctx context.Context,
lower, upper roachpb.Key,
visitPointKey func(key *pebble.InternalKey, value pebble.LazyValue, info pebble.IteratorLevel) error,
visitRangeDel func(start []byte, end []byte, seqNum pebble.SeqNum) error,
visitRangeKey func(start []byte, end []byte, keys []rangekey.Key) error,
visitSharedFile func(sst *pebble.SharedSSTMeta) error,
visitExternalFile func(sst *pebble.ExternalFile) error,
) error {
return p.parent.ScanInternal(ctx, lower, upper, visitPointKey, visitRangeDel, visitRangeKey, visitSharedFile, visitExternalFile)
}
// Writer methods are not implemented for pebbleReadOnly. Ideally, the code
// could be refactored so that a Reader could be supplied to evaluateBatch.
//
// Writer is the write interface to an engine's data.
func (p *pebbleReadOnly) ApplyBatchRepr(repr []byte, sync bool) error {
panic("not implemented")
}
func (p *pebbleReadOnly) ClearMVCC(key MVCCKey, opts ClearOptions) error {
panic("not implemented")
}
func (p *pebbleReadOnly) ClearUnversioned(key roachpb.Key, opts ClearOptions) error {
panic("not implemented")
}
func (p *pebbleReadOnly) ClearEngineKey(key EngineKey, opts ClearOptions) error {
panic("not implemented")
}
func (p *pebbleReadOnly) SingleClearEngineKey(key EngineKey) error {
panic("not implemented")
}
func (p *pebbleReadOnly) ClearRawRange(start, end roachpb.Key, pointKeys, rangeKeys bool) error {
panic("not implemented")
}
func (p *pebbleReadOnly) ClearMVCCRange(start, end roachpb.Key, pointKeys, rangeKeys bool) error {
panic("not implemented")
}
func (p *pebbleReadOnly) ClearMVCCVersions(start, end MVCCKey) error {
panic("not implemented")
}
func (p *pebbleReadOnly) ClearMVCCIteratorRange(
start, end roachpb.Key, pointKeys, rangeKeys bool,
) error {
panic("not implemented")
}
func (p *pebbleReadOnly) PutMVCCRangeKey(MVCCRangeKey, MVCCValue) error {
panic("not implemented")
}
func (p *pebbleReadOnly) PutRawMVCCRangeKey(MVCCRangeKey, []byte) error {
panic("not implemented")
}
func (p *pebbleReadOnly) PutEngineRangeKey(roachpb.Key, roachpb.Key, []byte, []byte) error {
panic("not implemented")
}
func (p *pebbleReadOnly) ClearEngineRangeKey(roachpb.Key, roachpb.Key, []byte) error {
panic("not implemented")
}
func (p *pebbleReadOnly) ClearMVCCRangeKey(MVCCRangeKey) error {
panic("not implemented")
}
func (p *pebbleReadOnly) Merge(key MVCCKey, value []byte) error {
panic("not implemented")
}
func (p *pebbleReadOnly) PutMVCC(key MVCCKey, value MVCCValue) error {
panic("not implemented")
}
func (p *pebbleReadOnly) PutRawMVCC(key MVCCKey, value []byte) error {
panic("not implemented")
}
func (p *pebbleReadOnly) PutUnversioned(key roachpb.Key, value []byte) error {
panic("not implemented")
}
func (p *pebbleReadOnly) PutEngineKey(key EngineKey, value []byte) error {
panic("not implemented")
}
func (p *pebbleReadOnly) LogData(data []byte) error {
panic("not implemented")
}
func (p *pebbleReadOnly) LogLogicalOp(op MVCCLogicalOpType, details MVCCLogicalOpDetails) {
panic("not implemented")
}
func (p *pebbleReadOnly) ShouldWriteLocalTimestamps(ctx context.Context) bool {
panic("not implemented")
}
func (p *pebbleReadOnly) BufferedSize() int {
panic("not implemented")
}
// pebbleSnapshot represents a snapshot created using Pebble.NewSnapshot().
type pebbleSnapshot struct {
snapshot *pebble.Snapshot
parent *Pebble
closed bool
}
var _ Reader = &pebbleSnapshot{}
// Close implements the Reader interface.
func (p *pebbleSnapshot) Close() {
_ = p.snapshot.Close()
p.closed = true
}
// Closed implements the Reader interface.
func (p *pebbleSnapshot) Closed() bool {
return p.closed
}
// MVCCIterate implements the Reader interface.
func (p *pebbleSnapshot) MVCCIterate(
ctx context.Context,
start, end roachpb.Key,
iterKind MVCCIterKind,
keyTypes IterKeyType,
readCategory fs.ReadCategory,
f func(MVCCKeyValue, MVCCRangeKeyStack) error,
) error {
if iterKind == MVCCKeyAndIntentsIterKind {
r := wrapReader(p)
// Doing defer r.Free() does not inline.
err := iterateOnReader(ctx, r, start, end, iterKind, keyTypes, readCategory, f)
r.Free()
return err
}
return iterateOnReader(ctx, p, start, end, iterKind, keyTypes, readCategory, f)
}
// NewMVCCIterator implements the Reader interface.
func (p *pebbleSnapshot) NewMVCCIterator(
ctx context.Context, iterKind MVCCIterKind, opts IterOptions,
) (MVCCIterator, error) {
if iterKind == MVCCKeyAndIntentsIterKind {
r := wrapReader(p)
// Doing defer r.Free() does not inline.
iter, err := r.NewMVCCIterator(ctx, iterKind, opts)
r.Free()
if err != nil {
return nil, err
}
return maybeWrapInUnsafeIter(iter), nil
}
iter, err := newPebbleIterator(ctx, p.snapshot, opts, StandardDurability, p.parent)
if err != nil {
return nil, err
}
return maybeWrapInUnsafeIter(MVCCIterator(iter)), nil
}
// NewEngineIterator implements the Reader interface.
func (p *pebbleSnapshot) NewEngineIterator(
ctx context.Context, opts IterOptions,
) (EngineIterator, error) {
return newPebbleIterator(ctx, p.snapshot, opts, StandardDurability, p.parent)
}
// ConsistentIterators implements the Reader interface.
func (p *pebbleSnapshot) ConsistentIterators() bool {
return true
}
// PinEngineStateForIterators implements the Reader interface.
func (p *pebbleSnapshot) PinEngineStateForIterators(fs.ReadCategory) error {
// Snapshot already pins state, so nothing to do.
return nil
}
// ScanInternal implements the Reader interface.
func (p *pebbleSnapshot) ScanInternal(
ctx context.Context,
lower, upper roachpb.Key,
visitPointKey func(key *pebble.InternalKey, value pebble.LazyValue, info pebble.IteratorLevel) error,
visitRangeDel func(start []byte, end []byte, seqNum pebble.SeqNum) error,
visitRangeKey func(start []byte, end []byte, keys []rangekey.Key) error,
visitSharedFile func(sst *pebble.SharedSSTMeta) error,
visitExternalFile func(sst *pebble.ExternalFile) error,
) error {
rawLower := EngineKey{Key: lower}.Encode()
rawUpper := EngineKey{Key: upper}.Encode()
// TODO(sumeer): set category.
return p.snapshot.ScanInternal(ctx, block.CategoryUnknown, rawLower, rawUpper, visitPointKey,
visitRangeDel, visitRangeKey, visitSharedFile, visitExternalFile)
}
// pebbleEFOS represents an eventually file-only snapshot created using
// NewEventuallyFileOnlySnapshot.
type pebbleEFOS struct {
efos *pebble.EventuallyFileOnlySnapshot
parent *Pebble
keyRanges []roachpb.Span
closed bool
}
var _ EventuallyFileOnlyReader = &pebbleEFOS{}
// Close implements the Reader interface.
func (p *pebbleEFOS) Close() {
_ = p.efos.Close()
p.closed = true
}
// Closed implements the Reader interface.
func (p *pebbleEFOS) Closed() bool {
return p.closed
}
// MVCCIterate implements the Reader interface.
func (p *pebbleEFOS) MVCCIterate(
ctx context.Context,
start, end roachpb.Key,
iterKind MVCCIterKind,
keyTypes IterKeyType,
readCategory fs.ReadCategory,
f func(MVCCKeyValue, MVCCRangeKeyStack) error,
) error {
if iterKind == MVCCKeyAndIntentsIterKind {
r := wrapReader(p)
// Doing defer r.Free() does not inline.
err := iterateOnReader(ctx, r, start, end, iterKind, keyTypes, readCategory, f)
r.Free()
return err
}
return iterateOnReader(ctx, p, start, end, iterKind, keyTypes, readCategory, f)
}
// WaitForFileOnly implements the EventuallyFileOnlyReader interface.
func (p *pebbleEFOS) WaitForFileOnly(
ctx context.Context, gracePeriodBeforeFlush time.Duration,
) error {
return p.efos.WaitForFileOnlySnapshot(ctx, gracePeriodBeforeFlush)
}
// NewMVCCIterator implements the Reader interface.
func (p *pebbleEFOS) NewMVCCIterator(
ctx context.Context, iterKind MVCCIterKind, opts IterOptions,
) (MVCCIterator, error) {
// Check if the bounds fall within the EFOS' keyRanges. We can only do this
// check for non-prefix iterators as prefix iterators often don't specify
// any bounds.
if !opts.Prefix {
if opts.LowerBound == nil || opts.UpperBound == nil {
return nil, errors.AssertionFailedf("cannot create iterators on EFOS without bounds")
}
var found bool
boundSpan := roachpb.Span{Key: opts.LowerBound, EndKey: opts.UpperBound}
for i := range p.keyRanges {
if p.keyRanges[i].Contains(boundSpan) {
found = true
break
}
}
if !found {
return nil, errors.AssertionFailedf("iterator bounds exceed eventually-file-only-snapshot key ranges: %s", boundSpan.String())
}
}
if iterKind == MVCCKeyAndIntentsIterKind {
r := wrapReader(p)
// Doing defer r.Free() does not inline.
iter, err := r.NewMVCCIterator(ctx, iterKind, opts)
r.Free()
if err != nil {
return nil, err
}
return maybeWrapInUnsafeIter(iter), nil
}
iter, err := newPebbleIterator(ctx, p.efos, opts, StandardDurability, p.parent)
if err != nil {
return nil, err
}
return maybeWrapInUnsafeIter(MVCCIterator(iter)), nil
}
// NewEngineIterator implements the Reader interface.
func (p *pebbleEFOS) NewEngineIterator(
ctx context.Context, opts IterOptions,
) (EngineIterator, error) {
return newPebbleIterator(ctx, p.efos, opts, StandardDurability, p.parent)
}
// ConsistentIterators implements the Reader interface.
func (p *pebbleEFOS) ConsistentIterators() bool {
return true
}
// PinEngineStateForIterators implements the Reader interface.
func (p *pebbleEFOS) PinEngineStateForIterators(fs.ReadCategory) error {
// Snapshot already pins state, so nothing to do.
return nil
}
// ScanInternal implements the Reader interface.
func (p *pebbleEFOS) ScanInternal(
ctx context.Context,
lower, upper roachpb.Key,
visitPointKey func(key *pebble.InternalKey, value pebble.LazyValue, info pebble.IteratorLevel) error,
visitRangeDel func(start []byte, end []byte, seqNum pebble.SeqNum) error,
visitRangeKey func(start []byte, end []byte, keys []rangekey.Key) error,
visitSharedFile func(sst *pebble.SharedSSTMeta) error,
visitExternalFile func(sst *pebble.ExternalFile) error,
) error {
rawLower := EngineKey{Key: lower}.Encode()
rawUpper := EngineKey{Key: upper}.Encode()
// TODO(sumeer): set category.
return p.efos.ScanInternal(ctx, block.CategoryUnknown, rawLower, rawUpper, visitPointKey,
visitRangeDel, visitRangeKey, visitSharedFile, visitExternalFile)
}
// ExceedMaxSizeError is the error returned when an export request
// fails due the export size exceeding the budget. This can be caused
// by large KVs that have many revisions.
type ExceedMaxSizeError struct {
reached int64
maxSize uint64
}
var _ error = &ExceedMaxSizeError{}
func (e *ExceedMaxSizeError) Error() string {
return fmt.Sprintf("export size (%d bytes) exceeds max size (%d bytes)", e.reached, e.maxSize)
}
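// Callers that need to distinguish this condition can test for the concrete
// error type, e.g. (a sketch):
//
//	var sizeErr *ExceedMaxSizeError
//	if errors.As(err, &sizeErr) {
//		// The export hit its size budget; retry with a smaller target size or
//		// surface the error to the client.
//	}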
// Copyright 2019 The Cockroach Authors.
//
// Use of this software is governed by the CockroachDB Software License
// included in the /LICENSE file.
package storage
import (
"context"
"sync"
"github.com/cockroachdb/cockroach/pkg/keys"
"github.com/cockroachdb/cockroach/pkg/roachpb"
"github.com/cockroachdb/cockroach/pkg/settings/cluster"
"github.com/cockroachdb/cockroach/pkg/storage/fs"
"github.com/cockroachdb/cockroach/pkg/storage/pebbleiter"
"github.com/cockroachdb/errors"
"github.com/cockroachdb/pebble"
"github.com/cockroachdb/pebble/rangekey"
)
var (
writeBatchPool = sync.Pool{
New: func() interface{} {
return &writeBatch{}
},
}
readWriteBatchPool = sync.Pool{
New: func() interface{} {
return &pebbleBatch{}
},
}
)
// Instantiates a new writeBatch.
func newWriteBatch(
db *pebble.DB,
batch *pebble.Batch,
settings *cluster.Settings,
parent *Pebble,
batchStatsReporter batchStatsReporter,
) *writeBatch {
wb := writeBatchPool.Get().(*writeBatch)
*wb = writeBatch{
db: db,
batch: batch,
buf: wb.buf,
parent: parent,
batchStatsReporter: batchStatsReporter,
settings: settings,
}
return wb
}
// A writeBatch wraps a pebble.Batch, omitting any facilities for reading. It's
// used when only a WriteBatch is needed.
type writeBatch struct {
db *pebble.DB
batch *pebble.Batch
buf []byte
parent *Pebble
batchStatsReporter batchStatsReporter
settings *cluster.Settings
closed bool
shouldWriteLocalTimestamps bool
shouldWriteLocalTimestampsCached bool
}
var _ WriteBatch = (*writeBatch)(nil)
type batchStatsReporter interface {
aggregateBatchCommitStats(stats BatchCommitStats)
}
// ApplyBatchRepr implements the Writer interface.
func (wb *writeBatch) ApplyBatchRepr(repr []byte, sync bool) error {
var batch pebble.Batch
if err := batch.SetRepr(repr); err != nil {
return err
}
return wb.batch.Apply(&batch, nil)
}
// ClearMVCC implements the Writer interface.
func (wb *writeBatch) ClearMVCC(key MVCCKey, opts ClearOptions) error {
if key.Timestamp.IsEmpty() {
panic("ClearMVCC timestamp is empty")
}
return wb.clear(key, opts)
}
// ClearUnversioned implements the Writer interface.
func (wb *writeBatch) ClearUnversioned(key roachpb.Key, opts ClearOptions) error {
return wb.clear(MVCCKey{Key: key}, opts)
}
// ClearEngineKey implements the Writer interface.
func (wb *writeBatch) ClearEngineKey(key EngineKey, opts ClearOptions) error {
if len(key.Key) == 0 {
return emptyKeyError()
}
wb.buf = key.EncodeToBuf(wb.buf[:0])
if !opts.ValueSizeKnown {
return wb.batch.Delete(wb.buf, nil)
}
return wb.batch.DeleteSized(wb.buf, opts.ValueSize, nil)
}
// ClearMVCCIteratorRange implements the Batch interface.
func (wb *writeBatch) ClearMVCCIteratorRange(
start, end roachpb.Key, pointKeys, rangeKeys bool,
) error {
// TODO(jackson): Remove this method. See the TODO in its definition within
// the Writer interface.
panic("batch is write-only")
}
func (wb *writeBatch) clear(key MVCCKey, opts ClearOptions) error {
if len(key.Key) == 0 {
return emptyKeyError()
}
wb.buf = EncodeMVCCKeyToBuf(wb.buf[:0], key)
if !opts.ValueSizeKnown {
return wb.batch.Delete(wb.buf, nil)
}
return wb.batch.DeleteSized(wb.buf, opts.ValueSize, nil)
}
// SingleClearEngineKey implements the Writer interface.
func (wb *writeBatch) SingleClearEngineKey(key EngineKey) error {
if len(key.Key) == 0 {
return emptyKeyError()
}
wb.buf = key.EncodeToBuf(wb.buf[:0])
return wb.batch.SingleDelete(wb.buf, nil)
}
// ClearRawRange implements the Writer interface.
func (wb *writeBatch) ClearRawRange(start, end roachpb.Key, pointKeys, rangeKeys bool) error {
wb.buf = EngineKey{Key: start}.EncodeToBuf(wb.buf[:0])
endRaw := EngineKey{Key: end}.Encode()
if pointKeys {
if err := wb.batch.DeleteRange(wb.buf, endRaw, pebble.Sync); err != nil {
return err
}
}
if rangeKeys {
if err := wb.batch.RangeKeyDelete(wb.buf, endRaw, pebble.Sync); err != nil {
return err
}
}
return nil
}
// ClearMVCCRange implements the Writer interface.
func (wb *writeBatch) ClearMVCCRange(start, end roachpb.Key, pointKeys, rangeKeys bool) error {
if err := wb.ClearRawRange(start, end, pointKeys, rangeKeys); err != nil {
return err
}
// The lock table only contains point keys, so only clear it when point keys
// are requested, and don't clear range keys in it.
if !pointKeys {
return nil
}
lstart, _ := keys.LockTableSingleKey(start, nil)
lend, _ := keys.LockTableSingleKey(end, nil)
return wb.ClearRawRange(lstart, lend, true /* pointKeys */, false /* rangeKeys */)
}
// ClearMVCCVersions implements the Writer interface.
func (wb *writeBatch) ClearMVCCVersions(start, end MVCCKey) error {
wb.buf = EncodeMVCCKeyToBuf(wb.buf[:0], start)
return wb.batch.DeleteRange(wb.buf, EncodeMVCCKey(end), nil)
}
// ClearMVCCRangeKey implements the Writer interface.
func (wb *writeBatch) ClearMVCCRangeKey(rangeKey MVCCRangeKey) error {
if err := rangeKey.Validate(); err != nil {
return err
}
// If the range key holds an encoded timestamp as it was read from storage,
// write the tombstone to clear it using the same encoding of the timestamp.
// See #129592.
if len(rangeKey.EncodedTimestampSuffix) > 0 {
return wb.ClearEngineRangeKey(
rangeKey.StartKey, rangeKey.EndKey, rangeKey.EncodedTimestampSuffix)
}
return wb.ClearEngineRangeKey(
rangeKey.StartKey, rangeKey.EndKey, EncodeMVCCTimestampSuffix(rangeKey.Timestamp))
}
// BufferedSize implements the Writer interface.
func (wb *writeBatch) BufferedSize() int {
return wb.Len()
}
// PutMVCCRangeKey implements the Writer interface.
func (wb *writeBatch) PutMVCCRangeKey(rangeKey MVCCRangeKey, value MVCCValue) error {
// NB: all MVCC APIs currently assume all range keys are range tombstones.
if !value.IsTombstone() {
return errors.New("range keys can only be MVCC range tombstones")
}
valueRaw, err := EncodeMVCCValue(value)
if err != nil {
return errors.Wrapf(err, "failed to encode MVCC value for range key %s", rangeKey)
}
return wb.PutRawMVCCRangeKey(rangeKey, valueRaw)
}
// PutRawMVCCRangeKey implements the Writer interface.
func (wb *writeBatch) PutRawMVCCRangeKey(rangeKey MVCCRangeKey, value []byte) error {
if err := rangeKey.Validate(); err != nil {
return err
}
// NB: We deliberately do not use rangeKey.EncodedTimestampSuffix even if
// it's present, because we explicitly do NOT want to write range keys with
// the synthetic bit set.
return wb.PutEngineRangeKey(
rangeKey.StartKey, rangeKey.EndKey, EncodeMVCCTimestampSuffix(rangeKey.Timestamp), value)
}
// PutEngineRangeKey implements the Writer interface.
func (wb *writeBatch) PutEngineRangeKey(start, end roachpb.Key, suffix, value []byte) error {
return wb.batch.RangeKeySet(
EngineKey{Key: start}.Encode(), EngineKey{Key: end}.Encode(), suffix, value, nil)
}
// ClearRawEncodedRange implements the InternalWriter interface.
func (wb *writeBatch) ClearRawEncodedRange(start, end []byte) error {
return wb.batch.DeleteRange(start, end, pebble.Sync)
}
// PutInternalRangeKey implements the InternalWriter interface.
func (wb *writeBatch) PutInternalRangeKey(start, end []byte, key rangekey.Key) error {
switch key.Kind() {
case pebble.InternalKeyKindRangeKeyUnset:
return wb.batch.RangeKeyUnset(start, end, key.Suffix, nil /* writeOptions */)
case pebble.InternalKeyKindRangeKeySet:
return wb.batch.RangeKeySet(start, end, key.Suffix, key.Value, nil /* writeOptions */)
case pebble.InternalKeyKindRangeKeyDelete:
return wb.batch.RangeKeyDelete(start, end, nil /* writeOptions */)
default:
panic("unexpected range key kind")
}
}
// PutInternalPointKey implements the InternalWriter interface.
func (wb *writeBatch) PutInternalPointKey(key *pebble.InternalKey, value []byte) error {
if len(key.UserKey) == 0 {
return emptyKeyError()
}
return wb.batch.AddInternalKey(key, value, nil /* writeOptions */)
}
// ClearEngineRangeKey implements the Engine interface.
func (wb *writeBatch) ClearEngineRangeKey(start, end roachpb.Key, suffix []byte) error {
return wb.batch.RangeKeyUnset(
EngineKey{Key: start}.Encode(), EngineKey{Key: end}.Encode(), suffix, nil)
}
// Merge implements the Writer interface.
func (wb *writeBatch) Merge(key MVCCKey, value []byte) error {
if len(key.Key) == 0 {
return emptyKeyError()
}
wb.buf = EncodeMVCCKeyToBuf(wb.buf[:0], key)
return wb.batch.Merge(wb.buf, value, nil)
}
// PutMVCC implements the Writer interface.
func (wb *writeBatch) PutMVCC(key MVCCKey, value MVCCValue) error {
if key.Timestamp.IsEmpty() {
panic("PutMVCC timestamp is empty")
}
return wb.putMVCC(key, value)
}
// PutRawMVCC implements the Writer interface.
func (wb *writeBatch) PutRawMVCC(key MVCCKey, value []byte) error {
if key.Timestamp.IsEmpty() {
panic("PutRawMVCC timestamp is empty")
}
return wb.put(key, value)
}
// PutUnversioned implements the Writer interface.
func (wb *writeBatch) PutUnversioned(key roachpb.Key, value []byte) error {
return wb.put(MVCCKey{Key: key}, value)
}
// PutEngineKey implements the Writer interface.
func (wb *writeBatch) PutEngineKey(key EngineKey, value []byte) error {
if len(key.Key) == 0 {
return emptyKeyError()
}
wb.buf = key.EncodeToBuf(wb.buf[:0])
return wb.batch.Set(wb.buf, value, nil)
}
func (wb *writeBatch) putMVCC(key MVCCKey, value MVCCValue) error {
// For performance, this method uses the pebble Batch's deferred operation
// API to avoid an extra memcpy. We:
// - determine the length of the encoded MVCC key and MVCC value
// - reserve space in the pebble Batch using SetDeferred
// - encode the MVCC key and MVCC value directly into the Batch
// - call Finish on the deferred operation (which will index the key if
// wb.batch is indexed)
valueLen, isExtended := mvccValueSize(value)
keyLen := encodedMVCCKeyLength(key)
o := wb.batch.SetDeferred(keyLen, valueLen)
encodeMVCCKeyToBuf(o.Key, key, keyLen)
if !isExtended {
// Fast path; we don't need to use the extended encoding and can copy
// RawBytes in verbatim.
copy(o.Value, value.Value.RawBytes)
} else {
// Slow path; we need the MVCC value header.
err := encodeExtendedMVCCValueToSizedBuf(value, o.Value)
if err != nil {
return err
}
}
return o.Finish()
}
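// For comparison, a non-deferred equivalent would encode into temporary
// buffers and then copy them into the batch (a sketch; correct but incurs one
// extra copy per key/value):
//
//	encValue, err := EncodeMVCCValue(value)
//	if err != nil {
//		return err
//	}
//	return wb.batch.Set(EncodeMVCCKey(key), encValue, nil)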
func (wb *writeBatch) put(key MVCCKey, value []byte) error {
if len(key.Key) == 0 {
return emptyKeyError()
}
keyLen := encodedMVCCKeyLength(key)
o := wb.batch.SetDeferred(keyLen, len(value))
encodeMVCCKeyToBuf(o.Key, key, keyLen)
copy(o.Value, value)
return o.Finish()
}
// LogData implements the Writer interface.
func (wb *writeBatch) LogData(data []byte) error {
return wb.batch.LogData(data, nil)
}
func (wb *writeBatch) LogLogicalOp(op MVCCLogicalOpType, details MVCCLogicalOpDetails) {
// No-op.
}
// Commit implements the WriteBatch interface.
func (wb *writeBatch) Commit(sync bool) error {
opts := pebble.NoSync
if sync {
opts = pebble.Sync
}
if wb.batch == nil {
panic("called with nil batch")
}
err := wb.batch.Commit(opts)
if err != nil {
// TODO(storage): ensure that these errors are only ever due to invariant
// violations and never due to unrecoverable Pebble states. Then switch to
// returning the error instead of panicking.
//
// Once we do that, document on the storage.Batch interface the meaning of
// an error returned from this method and the guarantees that callers have
// or don't have after they receive an error from this method.
panic(err)
}
wb.batchStatsReporter.aggregateBatchCommitStats(
BatchCommitStats{wb.batch.CommitStats()})
return err
}
// CommitNoSyncWait implements the WriteBatch interface.
func (wb *writeBatch) CommitNoSyncWait() error {
if wb.batch == nil {
panic("called with nil batch")
}
err := wb.db.ApplyNoSyncWait(wb.batch, pebble.Sync)
if err != nil {
// TODO(storage): ensure that these errors are only ever due to invariant
// violations and never due to unrecoverable Pebble states. Then switch to
// returning the error instead of panicking.
//
// Once we do that, document on the storage.Batch interface the meaning of
// an error returned from this method and the guarantees that callers have
// or don't have after they receive an error from this method.
panic(err)
}
return err
}
// SyncWait implements the WriteBatch interface.
func (wb *writeBatch) SyncWait() error {
if wb.batch == nil {
panic("called with nil batch")
}
err := wb.batch.SyncWait()
if err != nil {
// TODO(storage): ensure that these errors are only ever due to invariant
// violations and never due to unrecoverable Pebble states. Then switch to
// returning the error instead of panicking.
//
// Once we do that, document on the storage.Batch interface the meaning of
// an error returned from this method and the guarantees that callers have
// or don't have after they receive an error from this method.
panic(err)
}
wb.batchStatsReporter.aggregateBatchCommitStats(
BatchCommitStats{wb.batch.CommitStats()})
return err
}
// Empty implements the WriteBatch interface.
func (wb *writeBatch) Empty() bool {
return wb.batch.Count() == 0
}
// Count implements the WriteBatch interface.
func (wb *writeBatch) Count() uint32 {
return wb.batch.Count()
}
// Len implements the WriteBatch interface.
func (wb *writeBatch) Len() int {
return len(wb.batch.Repr())
}
// Repr implements the WriteBatch interface.
func (wb *writeBatch) Repr() []byte {
// Repr expects a "safe" byte slice as its output. The return value of
// wb.batch.Repr() is an unsafe byte slice owned by wb.batch. Since we could
// be sending this slice over the wire, we need to make a copy.
repr := wb.batch.Repr()
reprCopy := make([]byte, len(repr))
copy(reprCopy, repr)
return reprCopy
}
// CommitStats implements the WriteBatch interface.
func (wb *writeBatch) CommitStats() BatchCommitStats {
return BatchCommitStats{BatchCommitStats: wb.batch.CommitStats()}
}
// ShouldWriteLocalTimestamps implements the WriteBatch interface.
func (wb *writeBatch) ShouldWriteLocalTimestamps(ctx context.Context) bool {
// The write batch is short-lived, so cache the value for performance.
if !wb.shouldWriteLocalTimestampsCached {
wb.shouldWriteLocalTimestamps = shouldWriteLocalTimestamps(ctx, wb.settings)
wb.shouldWriteLocalTimestampsCached = true
}
return wb.shouldWriteLocalTimestamps
}
// Close implements the WriteBatch interface.
func (wb *writeBatch) Close() {
wb.close()
writeBatchPool.Put(wb)
}
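// close marks the writeBatch as closed and closes the underlying
// pebble.Batch without returning the writeBatch to its pool.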
func (wb *writeBatch) close() {
if wb.closed {
panic("closing an already-closed writeBatch")
}
wb.closed = true
_ = wb.batch.Close()
wb.batch = nil
}
// Wrapper struct around a pebble.Batch.
type pebbleBatch struct {
writeBatch
// The iterator reuse optimization in pebbleBatch is for servicing a
// BatchRequest, such that the iterators get reused across different
// requests in the batch.
// Reuse iterators for {normal,prefix} x {MVCCKey,EngineKey} iteration. We
// need separate iterators for EngineKey and MVCCKey iteration since
// iterators that make separated locks/intents appear interleaved need to
// use both simultaneously.
// When the first iterator is initialized, or when
// PinEngineStateForIterators is called (whichever happens first), the
// underlying *pebble.Iterator is stashed in iter, so that subsequent
// iterator initialization can use Iterator.Clone to use the same underlying
// engine state. This relies on the fact that all pebbleIterators created
// here are marked as reusable, which causes pebbleIterator.Close to not
// close iter. iter will be closed when pebbleBatch.Close is called.
prefixIter pebbleIterator
normalIter pebbleIterator
prefixEngineIter pebbleIterator
normalEngineIter pebbleIterator
iter pebbleiter.Iterator
iterUsed bool // avoids cloning after PinEngineStateForIterators()
}
var _ Batch = (*pebbleBatch)(nil)
// newPebbleBatch instantiates a new pebbleBatch.
func newPebbleBatch(
db *pebble.DB,
batch *pebble.Batch,
settings *cluster.Settings,
parent *Pebble,
batchStatsReporter batchStatsReporter,
) *pebbleBatch {
pb := readWriteBatchPool.Get().(*pebbleBatch)
*pb = pebbleBatch{
writeBatch: writeBatch{
db: db,
batch: batch,
buf: pb.buf,
parent: parent,
batchStatsReporter: batchStatsReporter,
settings: settings,
},
prefixIter: pebbleIterator{
lowerBoundBuf: pb.prefixIter.lowerBoundBuf,
upperBoundBuf: pb.prefixIter.upperBoundBuf,
reusable: true,
},
normalIter: pebbleIterator{
lowerBoundBuf: pb.normalIter.lowerBoundBuf,
upperBoundBuf: pb.normalIter.upperBoundBuf,
reusable: true,
},
prefixEngineIter: pebbleIterator{
lowerBoundBuf: pb.prefixEngineIter.lowerBoundBuf,
upperBoundBuf: pb.prefixEngineIter.upperBoundBuf,
reusable: true,
},
normalEngineIter: pebbleIterator{
lowerBoundBuf: pb.normalEngineIter.lowerBoundBuf,
upperBoundBuf: pb.normalEngineIter.upperBoundBuf,
reusable: true,
},
}
return pb
}
// Close implements the Batch interface.
func (p *pebbleBatch) Close() {
if p.iter != nil && !p.iterUsed {
if err := p.iter.Close(); err != nil {
panic(err)
}
}
// Setting iter to nil is sufficient since it will be closed by one of the
// subsequent destroy calls.
p.iter = nil
// Destroy the iterators before closing the batch.
p.prefixIter.destroy()
p.normalIter.destroy()
p.prefixEngineIter.destroy()
p.normalEngineIter.destroy()
p.writeBatch.close()
readWriteBatchPool.Put(p)
}
// Closed implements the Batch interface.
func (p *pebbleBatch) Closed() bool {
return p.closed
}
// MVCCIterate implements the Batch interface.
func (p *pebbleBatch) MVCCIterate(
ctx context.Context,
start, end roachpb.Key,
iterKind MVCCIterKind,
keyTypes IterKeyType,
readCategory fs.ReadCategory,
f func(MVCCKeyValue, MVCCRangeKeyStack) error,
) error {
if iterKind == MVCCKeyAndIntentsIterKind {
r := wrapReader(p)
// Doing defer r.Free() does not inline.
err := iterateOnReader(ctx, r, start, end, iterKind, keyTypes, readCategory, f)
r.Free()
return err
}
return iterateOnReader(ctx, p, start, end, iterKind, keyTypes, readCategory, f)
}
// NewMVCCIterator implements the Batch interface.
func (p *pebbleBatch) NewMVCCIterator(
ctx context.Context, iterKind MVCCIterKind, opts IterOptions,
) (MVCCIterator, error) {
if iterKind == MVCCKeyAndIntentsIterKind {
r := wrapReader(p)
// Doing defer r.Free() does not inline.
iter, err := r.NewMVCCIterator(ctx, iterKind, opts)
r.Free()
if err != nil {
return nil, err
}
return maybeWrapInUnsafeIter(iter), nil
}
iter := &p.normalIter
if opts.Prefix {
iter = &p.prefixIter
}
handle := pebble.Reader(p.batch)
if !p.batch.Indexed() {
handle = p.db
}
if iter.inuse {
return newPebbleIteratorByCloning(ctx, CloneContext{
rawIter: p.iter,
engine: p.parent,
}, opts, StandardDurability), nil
}
if iter.iter != nil {
iter.setOptions(ctx, opts, StandardDurability)
} else {
if err := iter.initReuseOrCreate(
ctx, handle, p.iter, p.iterUsed, opts, StandardDurability, p.parent); err != nil {
return nil, err
}
if p.iter == nil {
// For future cloning.
p.iter = iter.iter
}
p.iterUsed = true
}
iter.inuse = true
return maybeWrapInUnsafeIter(iter), nil
}
// NewBatchOnlyMVCCIterator implements the Batch interface.
func (p *pebbleBatch) NewBatchOnlyMVCCIterator(
ctx context.Context, opts IterOptions,
) (MVCCIterator, error) {
if !p.batch.Indexed() {
panic("unindexed batch")
}
var err error
iter := pebbleIterPool.Get().(*pebbleIterator)
iter.reusable = false // defensive
iter.init(ctx, nil, opts, StandardDurability, p.parent)
boIter, err := p.batch.NewBatchOnlyIter(ctx, &iter.options)
if err != nil {
iter.Close()
panic(err)
}
iter.iter = pebbleiter.MaybeWrap(boIter)
return iter, nil
}
// NewEngineIterator implements the Batch interface.
func (p *pebbleBatch) NewEngineIterator(
ctx context.Context, opts IterOptions,
) (EngineIterator, error) {
iter := &p.normalEngineIter
if opts.Prefix {
iter = &p.prefixEngineIter
}
handle := pebble.Reader(p.batch)
if !p.batch.Indexed() {
handle = p.db
}
if iter.inuse {
return newPebbleIteratorByCloning(ctx, CloneContext{
rawIter: p.iter,
engine: p.parent,
}, opts, StandardDurability), nil
}
if iter.iter != nil {
iter.setOptions(ctx, opts, StandardDurability)
} else {
if err := iter.initReuseOrCreate(
ctx, handle, p.iter, p.iterUsed, opts, StandardDurability, p.parent); err != nil {
return nil, err
}
if p.iter == nil {
// For future cloning.
p.iter = iter.iter
}
p.iterUsed = true
}
iter.inuse = true
return iter, nil
}
// ScanInternal implements the Reader interface.
func (p *pebbleBatch) ScanInternal(
ctx context.Context,
lower, upper roachpb.Key,
visitPointKey func(key *pebble.InternalKey, value pebble.LazyValue, info pebble.IteratorLevel) error,
visitRangeDel func(start []byte, end []byte, seqNum pebble.SeqNum) error,
visitRangeKey func(start []byte, end []byte, keys []rangekey.Key) error,
visitSharedFile func(sst *pebble.SharedSSTMeta) error,
visitExternalFile func(sst *pebble.ExternalFile) error,
) error {
panic("ScanInternal only supported on Engine and Snapshot.")
}
// ConsistentIterators implements the Batch interface.
func (p *pebbleBatch) ConsistentIterators() bool {
return true
}
// PinEngineStateForIterators implements the Batch interface.
func (p *pebbleBatch) PinEngineStateForIterators(readCategory fs.ReadCategory) error {
var err error
if p.iter == nil {
var iter *pebble.Iterator
o := &pebble.IterOptions{Category: readCategory.PebbleCategory()}
if p.batch.Indexed() {
iter, err = p.batch.NewIter(o)
} else {
iter, err = p.db.NewIter(o)
}
if err != nil {
return err
}
p.iter = pebbleiter.MaybeWrap(iter)
// NB: p.iterUsed == false avoids cloning this in NewMVCCIterator(). We've
// just created it, so cloning it would just be overhead.
}
return nil
}
// ClearMVCCIteratorRange implements the Batch interface.
func (p *pebbleBatch) ClearMVCCIteratorRange(
start, end roachpb.Key, pointKeys, rangeKeys bool,
) error {
clearPointKeys := func(start, end roachpb.Key) error {
iter, err := p.NewMVCCIterator(context.Background(), MVCCKeyAndIntentsIterKind, IterOptions{
KeyTypes: IterKeyTypePointsOnly,
LowerBound: start,
UpperBound: end,
})
if err != nil {
return err
}
defer iter.Close()
for iter.SeekGE(MVCCKey{Key: start}); ; iter.Next() {
if valid, err := iter.Valid(); err != nil {
return err
} else if !valid {
break
}
// NB: UnsafeRawKey could be a serialized lock table key, and not just an
// MVCCKey.
if err := p.batch.Delete(iter.UnsafeRawKey(), nil); err != nil {
return err
}
}
return nil
}
if pointKeys {
if err := clearPointKeys(start, end); err != nil {
return err
}
}
clearRangeKeys := func(start, end roachpb.Key) error {
iter, err := p.NewMVCCIterator(context.Background(), MVCCKeyIterKind, IterOptions{
KeyTypes: IterKeyTypeRangesOnly,
LowerBound: start,
UpperBound: end,
})
if err != nil {
return err
}
defer iter.Close()
for iter.SeekGE(MVCCKey{Key: start}); ; iter.Next() {
if valid, err := iter.Valid(); err != nil {
return err
} else if !valid {
break
}
// TODO(erikgrinaker): We should consider reusing a buffer for the
// encoding here, but we don't expect to see many range keys.
rangeKeys := iter.RangeKeys()
startRaw := EncodeMVCCKey(MVCCKey{Key: rangeKeys.Bounds.Key})
endRaw := EncodeMVCCKey(MVCCKey{Key: rangeKeys.Bounds.EndKey})
for _, v := range rangeKeys.Versions {
if err := p.batch.RangeKeyUnset(startRaw, endRaw, v.EncodedTimestampSuffix, nil); err != nil {
return err
}
}
}
return nil
}
if rangeKeys {
if err := clearRangeKeys(start, end); err != nil {
return err
}
}
return nil
}
// Copyright 2019 The Cockroach Authors.
//
// Use of this software is governed by the CockroachDB Software License
// included in the /LICENSE file.
package storage
import (
"bytes"
"context"
"math"
"sync"
"github.com/cockroachdb/cockroach/pkg/keys"
"github.com/cockroachdb/cockroach/pkg/roachpb"
"github.com/cockroachdb/cockroach/pkg/storage/pebbleiter"
"github.com/cockroachdb/cockroach/pkg/util"
"github.com/cockroachdb/cockroach/pkg/util/hlc"
"github.com/cockroachdb/cockroach/pkg/util/protoutil"
"github.com/cockroachdb/errors"
"github.com/cockroachdb/pebble"
"github.com/cockroachdb/pebble/sstable"
)
// pebbleIterator is a wrapper around a pebble.Iterator that implements the
// MVCCIterator and EngineIterator interfaces. A single pebbleIterator
// should only be used in one of the two modes.
type pebbleIterator struct {
// Underlying iterator for the DB.
iter pebbleiter.Iterator
options pebble.IterOptions
// Reusable buffer for MVCCKey or EngineKey encoding.
keyBuf []byte
// Buffers for copying iterator options to. Note that the underlying memory
// is not GCed upon Close(), to reduce the number of overall allocations.
lowerBoundBuf []byte
upperBoundBuf []byte
rangeKeyMaskingBuf []byte
// Filter to use if masking is enabled.
maskFilter mvccWallTimeIntervalRangeKeyMask
// [minTimestamp,maxTimestamp] contain the encoded timestamp bounds of the
// iterator, if any. This iterator will not return keys outside these
// timestamps. These are encoded because lexicographic comparison on encoded
// timestamps is equivalent to the comparison on decoded timestamps. These
// timestamps are enforced through the IterOptions.SkipPoint function, which
// is provided with encoded keys.
//
// NB: minTimestamp and maxTimestamp are both inclusive.
minTimestamp []byte // inclusive
maxTimestamp []byte // inclusive
// Buffer used to store MVCCRangeKeyVersions returned by RangeKeys(). Lazily
// initialized the first time an iterator's RangeKeys() method is called.
mvccRangeKeyVersions []MVCCRangeKeyVersion
// parent is a pointer to the Engine from which the iterator was constructed.
parent *Pebble
// prefix governs whether to call SeekPrefixGE or SeekGE. When true, SSTables
// are skipped based on the MVCC/Engine key prefix.
prefix bool
// If reusable is true, Close() does not actually close the underlying
// iterator, but simply marks it as not inuse. Used by pebbleReadOnly.
reusable bool
inuse bool
// Set to true if the underlying Pebble Iterator was created through
// pebble.NewExternalIter, and so the iterator is iterating over files
// external to the storage engine. This is used to avoid panicking on
// corruption errors that should be non-fatal if encountered from external
// sources of sstables.
external bool
// mvccDirIsReverse and mvccDone are used only for the methods implementing
// MVCCIterator. They are used to prevent the iterator from iterating into
// the lock table key space.
//
// The current direction. false for forward, true for reverse.
mvccDirIsReverse bool
// True iff the iterator is exhausted in the current direction. There is
// no error to report when it is true.
mvccDone bool
}
var _ MVCCIterator = &pebbleIterator{}
var _ EngineIterator = &pebbleIterator{}
var pebbleIterPool = sync.Pool{
New: func() interface{} {
return &pebbleIterator{}
},
}
// newPebbleIterator creates a new Pebble iterator for the given Pebble reader.
func newPebbleIterator(
ctx context.Context,
handle pebble.Reader,
opts IterOptions,
durability DurabilityRequirement,
parent *Pebble,
) (*pebbleIterator, error) {
p := pebbleIterPool.Get().(*pebbleIterator)
p.reusable = false // defensive
p.init(ctx, nil, opts, durability, parent)
iter, err := handle.NewIterWithContext(ctx, &p.options)
if err != nil {
return nil, err
}
p.iter = pebbleiter.MaybeWrap(iter)
return p, nil
}
// newPebbleIteratorByCloning creates a new Pebble iterator by cloning the given
// iterator and reconfiguring it.
func newPebbleIteratorByCloning(
ctx context.Context, cloneCtx CloneContext, opts IterOptions, durability DurabilityRequirement,
) *pebbleIterator {
var err error
p := pebbleIterPool.Get().(*pebbleIterator)
p.reusable = false // defensive
p.init(ctx, nil, opts, durability, cloneCtx.engine)
p.iter, err = cloneCtx.rawIter.CloneWithContext(ctx, pebble.CloneOptions{
IterOptions: &p.options,
RefreshBatchView: true,
})
if err != nil {
p.Close()
panic(err)
}
return p
}
// newPebbleSSTIterator creates a new Pebble iterator for the given SSTs.
func newPebbleSSTIterator(
files [][]sstable.ReadableFile, opts IterOptions,
) (*pebbleIterator, error) {
p := pebbleIterPool.Get().(*pebbleIterator)
p.reusable = false // defensive
p.init(context.Background(), nil, opts, StandardDurability, nil)
iter, err := pebble.NewExternalIter(DefaultPebbleOptions(), &p.options, files)
if err != nil {
p.Close()
return nil, err
}
p.iter = pebbleiter.MaybeWrap(iter)
p.external = true
return p, nil
}
// init resets this pebbleIterator for use with the specified arguments,
// reconfiguring the given iter. It is valid to pass a nil iter and then create
// p.iter using p.options, to avoid redundant reconfiguration via SetOptions().
func (p *pebbleIterator) init(
ctx context.Context,
iter pebbleiter.Iterator,
opts IterOptions,
durability DurabilityRequirement,
statsReporter *Pebble,
) {
*p = pebbleIterator{
iter: iter,
keyBuf: p.keyBuf,
lowerBoundBuf: p.lowerBoundBuf,
upperBoundBuf: p.upperBoundBuf,
rangeKeyMaskingBuf: p.rangeKeyMaskingBuf,
parent: statsReporter,
reusable: p.reusable,
}
p.setOptions(ctx, opts, durability)
p.inuse = true // after setOptions(), so panic won't cause reader to panic too
}
// initReuseOrCreate is a convenience method that (re-)initializes an existing
// pebbleIterator in one of three ways:
//
// 1. iter != nil && !clone: use and reconfigure the given raw Pebble iterator.
// 2. iter != nil && clone: clone and reconfigure the given raw Pebble iterator.
// 3. iter == nil: create a new iterator from handle.
func (p *pebbleIterator) initReuseOrCreate(
ctx context.Context,
handle pebble.Reader,
iter pebbleiter.Iterator,
clone bool,
opts IterOptions,
durability DurabilityRequirement,
statsReporter *Pebble,
) error {
if iter != nil && !clone {
p.init(ctx, iter, opts, durability, statsReporter)
return nil
}
p.init(ctx, nil, opts, durability, statsReporter)
if iter == nil {
// TODO(sumeer): fix after bumping to latest Pebble.
innerIter, err := handle.NewIterWithContext(ctx, &p.options)
if err != nil {
return err
}
p.iter = pebbleiter.MaybeWrap(innerIter)
} else if clone {
var err error
p.iter, err = iter.CloneWithContext(ctx, pebble.CloneOptions{
IterOptions: &p.options,
RefreshBatchView: true,
})
if err != nil {
p.Close()
return err
}
}
return nil
}
// setOptions updates the options for a pebbleIterator. If p.iter is non-nil, it
// updates the options on the existing iterator too, and sets the context.
func (p *pebbleIterator) setOptions(
ctx context.Context, opts IterOptions, durability DurabilityRequirement,
) {
if !opts.Prefix && len(opts.UpperBound) == 0 && len(opts.LowerBound) == 0 {
panic("iterator must set prefix or upper bound or lower bound")
}
if opts.MinTimestamp.IsSet() && opts.MaxTimestamp.IsEmpty() {
panic("min timestamp hint set without max timestamp hint")
}
if opts.Prefix && opts.RangeKeyMaskingBelow.IsSet() {
panic("can't use range key masking with prefix iterators") // very high overhead
}
// Generate new Pebble iterator options.
p.options = pebble.IterOptions{
OnlyReadGuaranteedDurable: durability == GuaranteedDurability,
KeyTypes: opts.KeyTypes,
UseL6Filters: opts.useL6Filters,
Category: opts.ReadCategory.PebbleCategory(),
}
p.prefix = opts.Prefix
if opts.LowerBound != nil {
// This is the same as
// p.options.LowerBound = EncodeKeyToBuf(p.lowerBoundBuf[0][:0], MVCCKey{Key: opts.LowerBound})
// or EngineKey{Key: opts.LowerBound}.EncodeToBuf(...).
// Since we are encoding keys with an empty version anyway, we can just
// append the NUL byte instead of calling the above encode functions which
// will do the same thing.
p.lowerBoundBuf = append(p.lowerBoundBuf[:0], opts.LowerBound...)
p.lowerBoundBuf = append(p.lowerBoundBuf, 0x00)
p.options.LowerBound = p.lowerBoundBuf
}
if opts.UpperBound != nil {
// Same as above.
p.upperBoundBuf = append(p.upperBoundBuf[:0], opts.UpperBound...)
p.upperBoundBuf = append(p.upperBoundBuf, 0x00)
p.options.UpperBound = p.upperBoundBuf
}
if opts.RangeKeyMaskingBelow.IsSet() {
p.rangeKeyMaskingBuf = encodeMVCCTimestampSuffixToBuf(
p.rangeKeyMaskingBuf, opts.RangeKeyMaskingBelow)
p.options.RangeKeyMasking.Suffix = p.rangeKeyMaskingBuf
p.maskFilter.BlockIntervalFilter.Init(mvccWallTimeIntervalCollector, 0, math.MaxUint64, MVCCBlockIntervalSuffixReplacer{})
p.options.RangeKeyMasking.Filter = p.getBlockPropertyFilterMask
}
if opts.MaxTimestamp.IsSet() {
// Install an IterOptions.SkipPoint function to ensure that we skip over
// any keys outside the time bounds that don't get excluded by the
// coarse, opportunistic block-property filters. To avoid decoding
// per-KV, the SkipPoint function performs lexicographic comparisons on
// encoded timestamps, which is equivalent to the decoded, logical
// comparisons when ignoring the synthetic bit. In lexicographic order,
// the encoded key with the synthetic bit set sorts after the same
// timestamp without the synthetic bit. Timestamps differing only in the
// synthetic bit should otherwise be equal, so we take care to construct
// a minimum bound without the bit and a maximum bound with the bit to
// be inclusive on both ends.
p.minTimestamp = encodeMVCCTimestamp(hlc.Timestamp{
WallTime: opts.MinTimestamp.WallTime,
Logical: opts.MinTimestamp.Logical,
})
p.maxTimestamp = append(encodeMVCCTimestamp(hlc.Timestamp{
WallTime: opts.MaxTimestamp.WallTime,
Logical: opts.MaxTimestamp.Logical,
}), 0x01 /* Synthetic bit */)
p.options.SkipPoint = p.skipPointIfOutsideTimeBounds
// We are given an inclusive [MinTimestamp, MaxTimestamp]. The
// mvccWallTimeIntervalCollector has collected the WallTimes and we need
// [min, max), i.e., exclusive on the upper bound.
//
// NB: PointKeyFilters documents that when set to non-empty, the capacity
// of the slice should be at least one more than the length, for a
// Pebble-internal performance optimization.
pkf := [2]pebble.BlockPropertyFilter{
sstable.NewBlockIntervalFilter(mvccWallTimeIntervalCollector,
uint64(opts.MinTimestamp.WallTime),
uint64(opts.MaxTimestamp.WallTime)+1,
MVCCBlockIntervalSuffixReplacer{},
),
}
p.options.PointKeyFilters = pkf[:1:2]
// NB: We disable range key block filtering because of complications in
// MVCCIncrementalIterator.maybeSkipKeys: the TBI may see different range
// key fragmentation than the main iterator due to the filtering. This would
// necessitate additional seeks/processing that likely negate the marginal
// benefit of the range key filters. See:
// https://github.com/cockroachdb/cockroach/issues/86260.
//
// However, we do collect block properties for range keys, in case we enable
// this later.
p.options.RangeKeyFilters = nil
}
// Set the new iterator options. We unconditionally do so, since Pebble will
// optimize noop changes as needed, and it may affect batch write visibility.
if p.iter != nil {
p.iter.SetContext(ctx)
p.iter.SetOptions(&p.options)
}
}
// Close implements the MVCCIterator interface.
func (p *pebbleIterator) Close() {
if !p.inuse {
panic("closing idle iterator")
}
p.inuse = false
// Report the iterator's stats so they can be accumulated and exposed
// through time-series metrics.
if p.iter != nil && p.parent != nil {
p.parent.aggregateIterStats(p.Stats())
}
if p.reusable {
p.iter.ResetStats()
return
}
p.destroy()
pebbleIterPool.Put(p)
}
// SeekGE implements the MVCCIterator interface.
func (p *pebbleIterator) SeekGE(key MVCCKey) {
p.mvccDirIsReverse = false
p.mvccDone = false
p.keyBuf = EncodeMVCCKeyToBuf(p.keyBuf[:0], key)
if p.prefix {
p.iter.SeekPrefixGE(p.keyBuf)
} else {
p.iter.SeekGE(p.keyBuf)
}
}
// SeekEngineKeyGE implements the EngineIterator interface.
func (p *pebbleIterator) SeekEngineKeyGE(key EngineKey) (valid bool, err error) {
p.keyBuf = key.EncodeToBuf(p.keyBuf[:0])
var ok bool
if p.prefix {
ok = p.iter.SeekPrefixGE(p.keyBuf)
} else {
ok = p.iter.SeekGE(p.keyBuf)
}
// NB: A Pebble Iterator always returns ok==false when an error is
// present.
if ok {
return true, nil
}
return false, p.iter.Error()
}
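// SeekEngineKeyGEWithLimit implements the EngineIterator interface.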
func (p *pebbleIterator) SeekEngineKeyGEWithLimit(
key EngineKey, limit roachpb.Key,
) (state pebble.IterValidityState, err error) {
p.keyBuf = key.EncodeToBuf(p.keyBuf[:0])
if limit != nil {
if p.prefix {
panic("prefix iteration does not permit a limit")
}
// Append the sentinel byte to make an EngineKey that has an empty
// version.
limit = append(limit, '\x00')
}
if p.prefix {
state = pebble.IterExhausted
if p.iter.SeekPrefixGE(p.keyBuf) {
state = pebble.IterValid
}
} else {
state = p.iter.SeekGEWithLimit(p.keyBuf, limit)
}
if state == pebble.IterExhausted {
return state, p.iter.Error()
}
return state, nil
}
// Valid implements the MVCCIterator interface. Must not be called from
// methods of EngineIterator.
func (p *pebbleIterator) Valid() (bool, error) {
if p.mvccDone {
return false, nil
}
// NB: A Pebble Iterator always returns Valid()==false when an error is
// present. If Valid() is true, there is no error.
if !p.iter.Valid() {
return false, p.iter.Error()
}
// The MVCCIterator interface is broken in that it silently discards the
// error when UnsafeKey() is unable to parse the key as an MVCCKey. This is
// especially problematic if the caller is accidentally iterating into the
// lock table key space, since that parsing will fail. We do a cheap check
// here to make sure we are not in the lock table key space.
//
// TODO(sumeer): fix this properly by changing those method signatures.
k := p.iter.Key()
if len(k) == 0 {
return false, errors.Errorf("iterator encountered 0 length key")
}
// Last byte is the version length + 1 or 0.
versionLen := int(k[len(k)-1])
if versionLen == engineKeyVersionLockTableLen+1 {
p.mvccDone = true
return false, nil
}
if util.RaceEnabled {
if err := p.assertMVCCInvariants(); err != nil {
return false, err
}
}
return true, nil
}
// Next implements the MVCCIterator interface.
func (p *pebbleIterator) Next() {
if p.mvccDirIsReverse {
// Switching directions.
p.mvccDirIsReverse = false
p.mvccDone = false
}
if p.mvccDone {
return
}
p.iter.Next()
}
// NextEngineKey implements the EngineIterator interface.
func (p *pebbleIterator) NextEngineKey() (valid bool, err error) {
ok := p.iter.Next()
// NB: A Pebble Iterator always returns ok==false when an error is
// present.
if ok {
return true, nil
}
return false, p.iter.Error()
}
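// NextEngineKeyWithLimit implements the EngineIterator interface.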
func (p *pebbleIterator) NextEngineKeyWithLimit(
limit roachpb.Key,
) (state pebble.IterValidityState, err error) {
if limit != nil {
// Append the sentinel byte to make an EngineKey that has an empty
// version.
limit = append(limit, '\x00')
}
state = p.iter.NextWithLimit(limit)
if state == pebble.IterExhausted {
return state, p.iter.Error()
}
return state, nil
}
// NextKey implements the MVCCIterator interface.
func (p *pebbleIterator) NextKey() {
if p.mvccDirIsReverse {
// Switching directions.
p.mvccDirIsReverse = false
p.mvccDone = false
}
if p.mvccDone {
return
}
if valid, err := p.Valid(); err != nil || !valid {
return
}
// NB: If p.prefix, iterators can't move onto a separate key by definition,
// so the below call to NextPrefix will exhaust the iterator.
p.iter.NextPrefix()
}
// UnsafeKey implements the MVCCIterator interface.
func (p *pebbleIterator) UnsafeKey() MVCCKey {
mvccKey, err := DecodeMVCCKey(p.iter.Key())
if err != nil {
return MVCCKey{}
}
return mvccKey
}
// UnsafeEngineKey implements the EngineIterator interface.
func (p *pebbleIterator) UnsafeEngineKey() (EngineKey, error) {
engineKey, ok := DecodeEngineKey(p.iter.Key())
if !ok {
return engineKey, errors.Errorf("invalid encoded engine key: %x", p.iter.Key())
}
return engineKey, nil
}
// UnsafeRawKey returns the raw key from the underlying pebble.Iterator.
func (p *pebbleIterator) UnsafeRawKey() []byte {
return p.iter.Key()
}
// UnsafeRawMVCCKey implements the MVCCIterator interface.
func (p *pebbleIterator) UnsafeRawMVCCKey() []byte {
return p.iter.Key()
}
// UnsafeRawEngineKey implements the EngineIterator interface.
func (p *pebbleIterator) UnsafeRawEngineKey() []byte {
return p.iter.Key()
}
// UnsafeValue implements the MVCCIterator and EngineIterator interfaces.
func (p *pebbleIterator) UnsafeValue() ([]byte, error) {
if ok := p.iter.Valid(); !ok {
return nil, nil
}
return p.iter.ValueAndErr()
}
// UnsafeLazyValue implements the MVCCIterator and EngineIterator interfaces.
func (p *pebbleIterator) UnsafeLazyValue() pebble.LazyValue {
if ok := p.iter.Valid(); !ok {
panic(errors.AssertionFailedf("UnsafeLazyValue called on !Valid iterator"))
}
return p.iter.LazyValue()
}
// MVCCValueLenAndIsTombstone implements the MVCCIterator interface.
func (p *pebbleIterator) MVCCValueLenAndIsTombstone() (int, bool, error) {
lv := p.iter.LazyValue()
attr, ok := lv.TryGetShortAttribute()
var isTombstone bool
var valLen int
if ok {
isTombstone = attr != 0
valLen = lv.Len()
} else {
// Must be an in-place value, since it did not have a short attribute.
val := lv.InPlaceValue()
var err error
isTombstone, err = EncodedMVCCValueIsTombstone(val)
if err != nil {
return 0, false, err
}
valLen = len(val)
}
return valLen, isTombstone, nil
}
// ValueLen implements the MVCCIterator interface.
func (p *pebbleIterator) ValueLen() int {
lv := p.iter.LazyValue()
return lv.Len()
}
// SeekLT implements the MVCCIterator interface.
func (p *pebbleIterator) SeekLT(key MVCCKey) {
p.mvccDirIsReverse = true
p.mvccDone = false
p.keyBuf = EncodeMVCCKeyToBuf(p.keyBuf[:0], key)
p.iter.SeekLT(p.keyBuf)
}
// SeekEngineKeyLT implements the EngineIterator interface.
func (p *pebbleIterator) SeekEngineKeyLT(key EngineKey) (valid bool, err error) {
p.keyBuf = key.EncodeToBuf(p.keyBuf[:0])
ok := p.iter.SeekLT(p.keyBuf)
// NB: A Pebble Iterator always returns ok==false when an error is
// present.
if ok {
return true, nil
}
return false, p.iter.Error()
}
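// SeekEngineKeyLTWithLimit implements the EngineIterator interface.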
func (p *pebbleIterator) SeekEngineKeyLTWithLimit(
key EngineKey, limit roachpb.Key,
) (state pebble.IterValidityState, err error) {
p.keyBuf = key.EncodeToBuf(p.keyBuf[:0])
if limit != nil {
// Append the sentinel byte to make an EngineKey that has an empty
// version.
limit = append(limit, '\x00')
}
state = p.iter.SeekLTWithLimit(p.keyBuf, limit)
if state == pebble.IterExhausted {
return state, p.iter.Error()
}
return state, nil
}
// Prev implements the MVCCIterator interface.
func (p *pebbleIterator) Prev() {
if !p.mvccDirIsReverse {
// Switching directions.
p.mvccDirIsReverse = true
p.mvccDone = false
}
if p.mvccDone {
return
}
p.iter.Prev()
}
// PrevEngineKey implements the EngineIterator interface.
func (p *pebbleIterator) PrevEngineKey() (valid bool, err error) {
ok := p.iter.Prev()
// NB: A Pebble Iterator always returns ok==false when an error is
// present.
if ok {
return true, nil
}
return false, p.iter.Error()
}
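// PrevEngineKeyWithLimit implements the EngineIterator interface.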
func (p *pebbleIterator) PrevEngineKeyWithLimit(
limit roachpb.Key,
) (state pebble.IterValidityState, err error) {
if limit != nil {
// Append the sentinel byte to make an EngineKey that has an empty
// version.
limit = append(limit, '\x00')
}
state = p.iter.PrevWithLimit(limit)
if state == pebble.IterExhausted {
return state, p.iter.Error()
}
return state, nil
}
// EngineKey implements the EngineIterator interface.
func (p *pebbleIterator) EngineKey() (EngineKey, error) {
key, err := p.UnsafeEngineKey()
if err != nil {
return key, err
}
return key.Copy(), nil
}
// Value implements the MVCCIterator and EngineIterator interfaces.
func (p *pebbleIterator) Value() ([]byte, error) {
value, err := p.UnsafeValue()
if err != nil {
return nil, err
}
valueCopy := make([]byte, len(value))
copy(valueCopy, value)
return valueCopy, nil
}
// ValueProto implements the MVCCIterator interface.
func (p *pebbleIterator) ValueProto(msg protoutil.Message) error {
value, err := p.UnsafeValue()
if err != nil {
return err
}
return protoutil.Unmarshal(value, msg)
}
// HasPointAndRange implements the MVCCIterator interface.
func (p *pebbleIterator) HasPointAndRange() (bool, bool) {
return p.iter.HasPointAndRange()
}
// RangeBounds implements the MVCCIterator interface.
func (p *pebbleIterator) RangeBounds() roachpb.Span {
start, end := p.iter.RangeBounds()
// Avoid decoding empty keys: DecodeMVCCKey() will return errors for these,
// which are expensive to construct.
if len(start) == 0 && len(end) == 0 {
return roachpb.Span{}
}
// TODO(erikgrinaker): We should surface these errors somehow, but for now we
// follow UnsafeKey()'s example and silently return empty bounds.
startKey, err := DecodeMVCCKey(start)
if err != nil {
return roachpb.Span{}
}
endKey, err := DecodeMVCCKey(end)
if err != nil {
return roachpb.Span{}
}
return roachpb.Span{Key: startKey.Key, EndKey: endKey.Key}
}
// EngineRangeBounds implements the EngineIterator interface.
func (p *pebbleIterator) EngineRangeBounds() (roachpb.Span, error) {
start, end := p.iter.RangeBounds()
if len(start) == 0 && len(end) == 0 {
return roachpb.Span{}, nil
}
s, ok := DecodeEngineKey(start)
if !ok || len(s.Version) > 0 {
return roachpb.Span{}, errors.Errorf("invalid encoded engine key: %x", start)
}
e, ok := DecodeEngineKey(end)
if !ok || len(e.Version) > 0 {
return roachpb.Span{}, errors.Errorf("invalid encoded engine key: %x", end)
}
return roachpb.Span{Key: s.Key, EndKey: e.Key}, nil
}
// RangeKeys implements the MVCCIterator interface.
func (p *pebbleIterator) RangeKeys() MVCCRangeKeyStack {
rangeKeys := p.iter.RangeKeys()
stack := MVCCRangeKeyStack{
Bounds: p.RangeBounds(),
Versions: p.mvccRangeKeyVersions[:0],
}
if cap(stack.Versions) < len(rangeKeys) {
stack.Versions = make(MVCCRangeKeyVersions, 0, len(rangeKeys))
p.mvccRangeKeyVersions = stack.Versions
}
for _, rangeKey := range rangeKeys {
timestamp, err := DecodeMVCCTimestampSuffix(rangeKey.Suffix)
if err != nil {
// TODO(erikgrinaker): We should surface this error somehow, but for now
// we follow UnsafeKey()'s example and silently skip them.
continue
}
stack.Versions = append(stack.Versions, MVCCRangeKeyVersion{
Timestamp: timestamp,
Value: rangeKey.Value,
EncodedTimestampSuffix: rangeKey.Suffix,
})
}
return stack
}
// RangeKeyChanged implements the MVCCIterator interface.
func (p *pebbleIterator) RangeKeyChanged() bool {
return p.iter.RangeKeyChanged()
}
// EngineRangeKeys implements the EngineIterator interface.
func (p *pebbleIterator) EngineRangeKeys() []EngineRangeKeyValue {
rangeKeys := p.iter.RangeKeys()
rkvs := make([]EngineRangeKeyValue, 0, len(rangeKeys))
for _, rk := range rangeKeys {
rkvs = append(rkvs, EngineRangeKeyValue{Version: rk.Suffix, Value: rk.Value})
}
return rkvs
}
// isValidSplitKey returns whether the specified key is a valid split key: it
// must not equal Meta2KeyMax and must not be properly contained in any of the
// given no-split spans.
func isValidSplitKey(key roachpb.Key, noSplitSpans []roachpb.Span) bool {
if key.Equal(keys.Meta2KeyMax) {
// We do not allow splits at Meta2KeyMax. The reason for this is that range
// descriptors are stored at RangeMetaKey(range.EndKey), so the new range
// that ends at Meta2KeyMax would naturally store its descriptor at
// RangeMetaKey(Meta2KeyMax) = Meta1KeyMax. However, Meta1KeyMax already
// serves a different role of holding a second copy of the descriptor for
// the range that spans the meta2/userspace boundary (see case 3a in
// rangeAddressing). If we allowed splits at Meta2KeyMax, the two roles
// would overlap. See #1206.
return false
}
for i := range noSplitSpans {
if noSplitSpans[i].ProperlyContainsKey(key) {
return false
}
}
return true
}
// IsValidSplitKey returns whether the key is a valid split key. Adapter for
// the method above, for use from other packages.
func IsValidSplitKey(key roachpb.Key) bool {
return isValidSplitKey(key, keys.NoSplitSpans)
}
// FindSplitKey implements the MVCCIterator interface.
func (p *pebbleIterator) FindSplitKey(
start, end, minSplitKey roachpb.Key, targetSize int64,
) (MVCCKey, error) {
return findSplitKeyUsingIterator(p, start, end, minSplitKey, targetSize)
}
func findSplitKeyUsingIterator(
iter MVCCIterator, start, end, minSplitKey roachpb.Key, targetSize int64,
) (MVCCKey, error) {
const timestampLen = 12
sizeSoFar := int64(0)
bestDiff := int64(math.MaxInt64)
bestSplitKey := MVCCKey{}
// found indicates that we have found a valid split key that is the best
// known so far. If bestSplitKey is empty, that split key is in prevKey;
// otherwise it is in bestSplitKey.
found := false
prevKey := MVCCKey{}
// We only have to consider no-split spans if our minimum split key possibly
// lies before them. Note that the no-split spans are ordered by end-key.
var noSplitSpans []roachpb.Span
for i := range keys.NoSplitSpans {
if minSplitKey.Compare(keys.NoSplitSpans[i].EndKey) <= 0 {
noSplitSpans = keys.NoSplitSpans[i:]
break
}
}
// Note that it is unnecessary to compare against "end" to decide to
// terminate iteration because the iterator's upper bound has already been
// set to end.
mvccMinSplitKey := MakeMVCCMetadataKey(minSplitKey)
iter.SeekGE(MakeMVCCMetadataKey(start))
for ; ; iter.Next() {
valid, err := iter.Valid()
if err != nil {
return MVCCKey{}, err
}
if !valid {
break
}
mvccKey := iter.UnsafeKey()
diff := targetSize - sizeSoFar
if diff < 0 {
diff = -diff
}
if diff > bestDiff {
// diff will keep increasing past this point. And we must have had a valid
// candidate in the past since we can't be worse than MaxInt64.
break
}
if mvccMinSplitKey.Key != nil && !mvccKey.Less(mvccMinSplitKey) {
// mvccKey is >= mvccMinSplitKey. Set the minSplitKey to nil so we do
// not have to make any more checks going forward.
mvccMinSplitKey.Key = nil
}
if mvccMinSplitKey.Key == nil && diff < bestDiff &&
(len(noSplitSpans) == 0 || isValidSplitKey(mvccKey.Key, noSplitSpans)) {
// This is a valid candidate for a split key.
//
// Instead of copying bestSplitKey just yet, flip the found flag. In the
// most common case where the actual best split key is followed by a key
// that has diff > bestDiff (see the if statement with that predicate
// above), this lets us save a copy by reusing prevKey as the
// best split key.
bestDiff = diff
found = true
// Set length of bestSplitKey to 0, which the rest of this method relies
// on to check if the last key encountered was the best split key.
bestSplitKey.Key = bestSplitKey.Key[:0]
} else if found && len(bestSplitKey.Key) == 0 {
// We were just at a valid split key candidate, but then we came across
// a key that cannot be a split key (i.e. is in noSplitSpans), or was not
// an improvement over bestDiff. Copy the previous key as the
// bestSplitKey.
bestSplitKey.Timestamp = prevKey.Timestamp
bestSplitKey.Key = append(bestSplitKey.Key[:0], prevKey.Key...)
}
sizeSoFar += int64(iter.ValueLen())
if mvccKey.IsValue() && bytes.Equal(prevKey.Key, mvccKey.Key) {
// We only advanced timestamps, but not new mvcc keys.
sizeSoFar += timestampLen
} else {
sizeSoFar += int64(len(mvccKey.Key) + 1)
if mvccKey.IsValue() {
sizeSoFar += timestampLen
}
}
prevKey.Key = append(prevKey.Key[:0], mvccKey.Key...)
prevKey.Timestamp = mvccKey.Timestamp
}
// There are three distinct types of cases possible here:
//
// 1. No valid split key was found (found == false), in which case we return
// bestSplitKey (which should be MVCCKey{}).
// 2. The best candidate seen for a split key so far was encountered in the
// last iteration of the above loop. We broke out of the loop either due
// to iterator exhaustion (!p.iter.Valid()), or an increasing diff. Return
// prevKey as the best split key.
// 3. The best split key was seen multiple iterations ago, and was copied into
// bestSplitKey at some point (found == true, len(bestSplitKey.Key) > 0).
// Keys encountered after that point were invalid for being in noSplitSpans
// so return the bestSplitKey that had been copied.
//
// This if statement checks for case 2.
if found && len(bestSplitKey.Key) == 0 {
// Use the last key found as the best split key, since we broke out of the
// loop (due to iterator exhaustion or increasing diff) right after we saw
// the best split key. prevKey has to be a valid split key since the only
// way we'd have both found && len(bestSplitKey.Key) == 0 is when we've
// already checked prevKey for validity.
return prevKey, nil
}
return bestSplitKey, nil
}
// Stats implements the {MVCCIterator,EngineIterator} interfaces.
func (p *pebbleIterator) Stats() IteratorStats {
return IteratorStats{
Stats: p.iter.Stats(),
}
}
// IsPrefix implements the MVCCIterator interface.
func (p *pebbleIterator) IsPrefix() bool {
return p.prefix
}
// CloneContext is part of the EngineIterator interface.
func (p *pebbleIterator) CloneContext() CloneContext {
return CloneContext{rawIter: p.iter, engine: p.parent}
}
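// getBlockPropertyFilterMask returns the mask filter used for range key
// masking.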
func (p *pebbleIterator) getBlockPropertyFilterMask() pebble.BlockPropertyFilterMask {
return &p.maskFilter
}
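// skipPointIfOutsideTimeBounds is installed as the IterOptions.SkipPoint
// function when timestamp hints are set. It returns true for MVCC point keys
// whose encoded timestamp falls outside [minTimestamp, maxTimestamp].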
func (p *pebbleIterator) skipPointIfOutsideTimeBounds(key []byte) (skip bool) {
if len(key) == 0 {
return false
}
// Last byte is the version length + 1 when there is a version,
// else it is 0.
versionLen := int(key[len(key)-1])
if versionLen == 0 {
// This is not an MVCC key.
return false
}
// prefixPartEnd points to the sentinel byte, unless this is a bare suffix, in
// which case the index is -1.
prefixPartEnd := len(key) - 1 - versionLen
// Sanity check: the index should be >= -1. Additionally, if the index is >=
// 0, it should point to the sentinel byte, as this is a full EngineKey. If
// the key appears invalid and we don't understand it, don't skip it so the
// iterator will observe it and hopefully propagate an error up the stack.
if prefixPartEnd < -1 || (prefixPartEnd >= 0 && key[prefixPartEnd] != sentinel) {
return false
}
switch versionLen - 1 {
case engineKeyVersionWallTimeLen, engineKeyVersionWallAndLogicalTimeLen, engineKeyVersionWallLogicalAndSyntheticTimeLen:
// INVARIANT: -1 <= prefixPartEnd < len(b) - 1.
// Version consists of the bytes after the sentinel and before the length.
ts := key[prefixPartEnd+1 : len(key)-1]
// Lexicographic comparison on the encoded timestamps is equivalent to the
// comparison on decoded timestamps, so we avoid the need to decode the
// walltimes by performing simple byte comparisons.
if bytes.Compare(ts, p.minTimestamp) < 0 {
return true
}
if bytes.Compare(ts, p.maxTimestamp) > 0 {
return true
}
// minTimestamp ≤ ts ≤ maxTimestamp
//
// The key's timestamp is within the iterator's configured bounds.
return false
default:
// Not a MVCC key.
return false
}
}
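// destroy closes the underlying pebble iterator, if any, and resets the
// pebbleIterator while retaining its reusable buffers.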
func (p *pebbleIterator) destroy() {
if p.inuse {
panic("iterator still in use")
}
if p.iter != nil {
// If an error is encountered during iteration, it'll already have been
// surfaced by p.iter.Error() through Valid()'s error return value.
// Closing a pebble iterator that's in an error state surfaces that same
// error again. The client should've already handled the error when
// surfaced through Valid(), but wants to close the iterator (eg,
// potentially through a defer) and so we don't want to re-surface the
// error.
//
// TODO(jackson): In addition to errors accumulated during iteration, Close
// also returns errors encountered during the act of closing the iterator.
// Currently, most of these errors are swallowed. The error returned by
// iter.Close() may be an ephemeral error, or it may indicate a misuse of
// the Iterator or corruption. Ideally we would only swallow ephemeral errors
// (eg, DeadlineExceeded, etc), panicking on Close errors that are not known
// to be ephemeral/retriable. Until those ephemeral error types are
// enumerated, we panic only on the error types we know to be NOT ephemeral.
//
// See cockroachdb/pebble#1811.
//
// NB: The panic is omitted if the error is encountered on an external
// iterator which is iterating over uncommitted sstables.
if err := p.iter.Close(); !p.external && errors.Is(err, pebble.ErrCorruption) {
if p.parent != nil {
p.parent.writePreventStartupFile(context.Background(), err)
}
panic(err)
}
p.iter = nil
}
// Reset all fields except for the key and option buffers. Holding onto their
// underlying memory is more efficient to prevent extra allocations down the
// line.
*p = pebbleIterator{
keyBuf: p.keyBuf,
lowerBoundBuf: p.lowerBoundBuf,
upperBoundBuf: p.upperBoundBuf,
rangeKeyMaskingBuf: p.rangeKeyMaskingBuf,
reusable: p.reusable,
}
}
// assertMVCCInvariants asserts internal MVCC iterator invariants, returning
// an AssertionFailedf error on any failure. It must be called on a valid
// iterator after a complete state transition.
func (p *pebbleIterator) assertMVCCInvariants() error {
// Assert general MVCCIterator API invariants.
if err := assertMVCCIteratorInvariants(p); err != nil {
return err
}
// The underlying iterator must be valid, with !mvccDone.
if !p.iter.Valid() {
return errors.AssertionFailedf("underlying iter is invalid, with err=%v", p.iter.Error())
}
if p.mvccDone {
return errors.AssertionFailedf("valid iter with mvccDone set")
}
// The position must match the underlying iter.
if key, iterKey := p.UnsafeKey(), p.iter.Key(); !bytes.Equal(EncodeMVCCKey(key), iterKey) {
return errors.AssertionFailedf("UnsafeKey %s does not match iterator key %x", key, iterKey)
}
// The iterator must be marked as in use.
if !p.inuse {
return errors.AssertionFailedf("valid iter with inuse=false")
}
// Prefix must be exposed.
if p.prefix != p.IsPrefix() {
return errors.AssertionFailedf("IsPrefix() does not match prefix=%v", p.prefix)
}
return nil
}
// Copyright 2024 The Cockroach Authors.
//
// Use of this software is governed by the CockroachDB Software License
// included in the /LICENSE file.
package storage
import (
"context"
"github.com/cockroachdb/cockroach/pkg/util/log"
"github.com/cockroachdb/pebble"
)
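// pebbleLogger implements pebble.LoggerAndTracer, routing Pebble's log
// output to the STORAGE log channel.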
type pebbleLogger struct {
ctx context.Context
depth int
}
var _ pebble.LoggerAndTracer = pebbleLogger{}
func (l pebbleLogger) Infof(format string, args ...interface{}) {
log.Storage.InfofDepth(l.ctx, l.depth, format, args...)
}
func (l pebbleLogger) Fatalf(format string, args ...interface{}) {
log.Storage.FatalfDepth(l.ctx, l.depth, format, args...)
}
func (l pebbleLogger) Errorf(format string, args ...interface{}) {
log.Storage.ErrorfDepth(l.ctx, l.depth, format, args...)
}
// pebble.LoggerAndTracer does not expose verbosity levels in its logging
// interface, and Pebble logs go to a separate STORAGE channel.
//
// The tracing part of the interface is meant for user-facing activities, so
// in addition to outputting the event when tracing is enabled, we also log.
// The eventAlsoLogVerbosityLevel of 2 is chosen semi-arbitrarily since this
// is the only verbosity level in this file.
const eventAlsoLogVerbosityLevel = 2
func (l pebbleLogger) Eventf(ctx context.Context, format string, args ...interface{}) {
log.VEventfDepth(ctx, l.depth, eventAlsoLogVerbosityLevel, format, args...)
}
func (l pebbleLogger) IsTracingEnabled(ctx context.Context) bool {
return log.HasSpan(ctx) || log.ExpensiveLogEnabledVDepth(ctx, l.depth, eventAlsoLogVerbosityLevel)
}
// Copyright 2019 The Cockroach Authors.
//
// Use of this software is governed by the CockroachDB Software License
// included in the /LICENSE file.
package storage
import (
"io"
"slices"
"sort"
"sync"
"github.com/cockroachdb/cockroach/pkg/roachpb"
"github.com/cockroachdb/cockroach/pkg/storage/enginepb"
"github.com/cockroachdb/cockroach/pkg/util/hlc"
"github.com/cockroachdb/cockroach/pkg/util/protoutil"
"github.com/cockroachdb/errors"
"github.com/cockroachdb/pebble"
"github.com/gogo/protobuf/proto"
)
// sortAndDeduplicateRows sorts the Samples field of the time series data
// structure according to the samples' `Offset`s. At the same time, samples with
// duplicate offset values are removed - only the last sample with a given offset
// in the collection is retained.
func sortAndDeduplicateRows(ts *roachpb.InternalTimeSeriesData) {
// In the common case, appending the newer entries to the older entries
// will result in an already ordered result, and there will be one sample
// per offset. Optimize for that case.
isSortedUniq := true
for i := 1; i < len(ts.Samples); i++ {
if ts.Samples[i-1].Offset >= ts.Samples[i].Offset {
isSortedUniq = false
break
}
}
if isSortedUniq {
return
}
// Create an auxiliary array of array indexes, and sort that array according
// to the corresponding offset value in the ts.Samples collection. This
// yields the permutation of the current array indexes that will place the
// samples into sorted order. In order to guarantee only the last sample with
// a duplicated offset is retained, we must do a stable sort.
sortedSrcIdxs := make([]int, len(ts.Samples))
for i := range sortedSrcIdxs {
sortedSrcIdxs[i] = i
}
sort.SliceStable(sortedSrcIdxs, func(i, j int) bool {
return ts.Samples[sortedSrcIdxs[i]].Offset < ts.Samples[sortedSrcIdxs[j]].Offset
})
// Remove any duplicates from the permutation, keeping the *last* element
// merged for any given offset.
uniqSortedSrcIdxs := make([]int, 0, len(ts.Samples))
for destIdx := range sortedSrcIdxs {
if destIdx == len(sortedSrcIdxs)-1 || ts.Samples[sortedSrcIdxs[destIdx]].Offset != ts.Samples[sortedSrcIdxs[destIdx+1]].Offset {
uniqSortedSrcIdxs = append(uniqSortedSrcIdxs, sortedSrcIdxs[destIdx])
}
}
origSamples := ts.Samples
ts.Samples = make([]roachpb.InternalTimeSeriesSample, len(uniqSortedSrcIdxs))
// Apply the permutation in the auxiliary array to all of the relevant column
// arrays in the data set.
for destIdx, srcIdx := range uniqSortedSrcIdxs {
ts.Samples[destIdx] = origSamples[srcIdx]
}
}
// sortAndDeduplicateColumns sorts all column fields of the time series data
// structure according to the timeseries's `Offset` column. At the same time,
// duplicate offset values are removed - only the last instance of an offset in
// the collection is retained.
func sortAndDeduplicateColumns(ts *roachpb.InternalTimeSeriesData) {
// In the common case, appending the newer entries to the older entries
// will result in an already ordered result with no duplicated offsets.
// Optimize for that case.
isSortedUniq := true
for i := 1; i < len(ts.Offset); i++ {
if ts.Offset[i-1] >= ts.Offset[i] {
isSortedUniq = false
break
}
}
if isSortedUniq {
return
}
// Create an auxiliary array of array indexes, and sort that array according
// to the corresponding offset value in the `ts.Offset` collection. This yields
// the permutation of the current array indexes that will place the offsets into
// sorted order. In order to guarantee only the last column values corresponding
// to a duplicated offset are retained, we must do a stable sort.
sortedSrcIdxs := make([]int, len(ts.Offset))
for i := range sortedSrcIdxs {
sortedSrcIdxs[i] = i
}
sort.SliceStable(sortedSrcIdxs, func(i, j int) bool {
return ts.Offset[sortedSrcIdxs[i]] < ts.Offset[sortedSrcIdxs[j]]
})
// Remove any duplicates from the permutation, keeping the *last* element
// merged for any given offset.
uniqSortedSrcIdxs := make([]int, 0, len(ts.Offset))
for destIdx := range sortedSrcIdxs {
if destIdx == len(sortedSrcIdxs)-1 || ts.Offset[sortedSrcIdxs[destIdx]] != ts.Offset[sortedSrcIdxs[destIdx+1]] {
uniqSortedSrcIdxs = append(uniqSortedSrcIdxs, sortedSrcIdxs[destIdx])
}
}
origOffset, origLast, origCount, origSum, origMin, origMax, origFirst, origVariance :=
ts.Offset, ts.Last, ts.Count, ts.Sum, ts.Min, ts.Max, ts.First, ts.Variance
ts.Offset = make([]int32, len(uniqSortedSrcIdxs))
ts.Last = make([]float64, len(uniqSortedSrcIdxs))
// These columns are only present at resolutions generated as rollups. We
// detect this by checking if there are any count columns present (the
// choice of "count" is arbitrary; either all of these columns are present
// or none are).
if len(origCount) > 0 {
ts.Count = make([]uint32, len(uniqSortedSrcIdxs))
ts.Sum = make([]float64, len(uniqSortedSrcIdxs))
ts.Min = make([]float64, len(uniqSortedSrcIdxs))
ts.Max = make([]float64, len(uniqSortedSrcIdxs))
ts.First = make([]float64, len(uniqSortedSrcIdxs))
ts.Variance = make([]float64, len(uniqSortedSrcIdxs))
}
// Apply the permutation in the auxiliary array to all of the relevant column
// arrays in the data set.
for destIdx, srcIdx := range uniqSortedSrcIdxs {
ts.Offset[destIdx] = origOffset[srcIdx]
ts.Last[destIdx] = origLast[srcIdx]
if len(origCount) > 0 {
ts.Count[destIdx] = origCount[srcIdx]
ts.Sum[destIdx] = origSum[srcIdx]
ts.Min[destIdx] = origMin[srcIdx]
ts.Max[destIdx] = origMax[srcIdx]
ts.First[destIdx] = origFirst[srcIdx]
ts.Variance[destIdx] = origVariance[srcIdx]
}
}
}
// ensureColumnar detects time series data which is in the old row format,
// converting the row data into the new columnar format.
func ensureColumnar(ts *roachpb.InternalTimeSeriesData) {
for _, sample := range ts.Samples {
ts.Offset = append(ts.Offset, sample.Offset)
ts.Last = append(ts.Last, sample.Sum)
}
ts.Samples = ts.Samples[:0]
}
// MVCCValueMerger implements the `ValueMerger` interface. It buffers
// deserialized values in a slice in the order specified by `oldToNew`.
// It determines the order of incoming operands by whether they were added
// with `MergeNewer()` or `MergeOlder()`, reversing the slice as necessary
// to ensure operands are always appended. It merges these deserialized
// operands when `Finish()` is called.
//
// It supports merging either all `roachpb.InternalTimeSeriesData` values
// or all non-timeseries values. Attempting to merge a mixture of timeseries
// and non-timeseries values will result in an error.
type MVCCValueMerger struct {
timeSeriesOps []roachpb.InternalTimeSeriesData
rawByteOps [][]byte
oldestMergeTS hlc.LegacyTimestamp
oldToNew bool
// inPool is set when we put this instance in the mvccValueMergerPool.
inPool bool
// Used to avoid heap allocations when passing pointer to `Unmarshal()`.
meta enginepb.MVCCMetadata
// merged and metaSubset are used to avoid heap allocations in Finish().
merged roachpb.InternalTimeSeriesData
metaSubset enginepb.MVCCMetadataSubsetForMergeSerialization
// !!NOTE!! If any new fields are added to this struct, returnToPool needs to
// be updated accordingly.
}
var _ pebble.ValueMerger = (*MVCCValueMerger)(nil)
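// Offsets into the RawBytes field of a roachpb.Value: a 4-byte checksum is
// followed by a 1-byte value type tag, after which the data begins.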
const (
mvccChecksumSize = 4
mvccTagPos = mvccChecksumSize
mvccHeaderSize = mvccChecksumSize + 1
)
var mvccValueMergerPool = sync.Pool{
New: func() any {
return &MVCCValueMerger{}
},
}
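// NewMVCCValueMerger returns an MVCCValueMerger from the pool, ready for use.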
func NewMVCCValueMerger() *MVCCValueMerger {
t := mvccValueMergerPool.Get().(*MVCCValueMerger)
t.inPool = false
return t
}
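// returnToPool resets the merger, retaining allocated slices for reuse, and
// returns it to mvccValueMergerPool.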
func (t *MVCCValueMerger) returnToPool() {
for i := range t.timeSeriesOps {
t.timeSeriesOps[i].ResetRetainingSlices()
}
t.timeSeriesOps = t.timeSeriesOps[:0]
t.rawByteOps = t.rawByteOps[:0]
t.oldestMergeTS = hlc.LegacyTimestamp{}
t.oldToNew = false
t.merged.ResetRetainingSlices()
rawBytesSlice := t.metaSubset.RawBytes[:0]
t.metaSubset = enginepb.MVCCMetadataSubsetForMergeSerialization{
RawBytes: rawBytesSlice,
}
t.resetMeta()
t.inPool = true
mvccValueMergerPool.Put(t)
}
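// ensureOrder reverses the buffered operands, if necessary, so that they are
// stored in the requested order (old-to-new or new-to-old).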
func (t *MVCCValueMerger) ensureOrder(oldToNew bool) {
if oldToNew == t.oldToNew {
return
}
// Only one of the two Reverse calls should actually do something under
// error-free conditions, i.e., all operands are either timeseries or all are
// non-timeseries.
slices.Reverse(t.timeSeriesOps)
slices.Reverse(t.rawByteOps)
t.oldToNew = oldToNew
}
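// resetMeta clears t.meta while retaining its RawBytes slice for reuse.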
func (t *MVCCValueMerger) resetMeta() {
rawBytes := t.meta.RawBytes[:0]
t.meta = enginepb.MVCCMetadata{RawBytes: rawBytes}
}
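// deserializeMVCCValueAndAppend unmarshals an operand and appends it to the
// timeseries or raw-bytes operand slice, depending on its value type.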
func (t *MVCCValueMerger) deserializeMVCCValueAndAppend(value []byte) error {
t.resetMeta()
if err := t.meta.Unmarshal(value); err != nil { // nolint:protounmarshal
return errors.Wrap(err, "corrupted operand value")
}
if len(t.meta.RawBytes) < mvccHeaderSize {
return errors.Errorf("operand value too short")
}
if t.meta.RawBytes[mvccTagPos] == byte(roachpb.ValueType_TIMESERIES) {
if len(t.rawByteOps) > 0 {
return errors.Errorf("inconsistent value types for timeseries merge")
}
if cap(t.timeSeriesOps) > len(t.timeSeriesOps) {
// Reuse any previous slices inside the InternalTimeSeriesData.
t.timeSeriesOps = t.timeSeriesOps[:len(t.timeSeriesOps)+1]
} else {
t.timeSeriesOps = append(t.timeSeriesOps, roachpb.InternalTimeSeriesData{})
}
ts := &t.timeSeriesOps[len(t.timeSeriesOps)-1]
// We want to reuse existing slices so we cannot use protoutil.Unmarshal().
if err := ts.Unmarshal(t.meta.RawBytes[mvccHeaderSize:]); err != nil { // nolint:protounmarshal
return errors.Wrap(err, "corrupted timeseries")
}
} else {
if len(t.timeSeriesOps) > 0 {
return errors.Errorf("inconsistent value types for non-timeseries merge")
}
// Append to rawByteOps, reusing any slice that was already there.
n := len(t.rawByteOps)
t.rawByteOps = slices.Grow(t.rawByteOps, 1)[:n+1]
t.rawByteOps[n] = append(t.rawByteOps[n][:0], t.meta.RawBytes[mvccHeaderSize:]...)
}
// Save the timestamp of the oldest value since that is consistent with the
// behavior of the C++ DBMergeOperator.
if t.meta.MergeTimestamp != nil && (t.oldestMergeTS == hlc.LegacyTimestamp{} || !t.oldToNew) {
t.oldestMergeTS = *t.meta.MergeTimestamp
}
return nil
}
// MergeNewer deserializes the value and appends it to the slice corresponding to its type
// (timeseries or non-timeseries). The slice will be reversed if needed such that it is in
// old-to-new order.
func (t *MVCCValueMerger) MergeNewer(value []byte) error {
if t.inPool {
return errors.AssertionFailedf("MVCCValueMerger used after being returned to pool")
}
t.ensureOrder(true /* oldToNew */)
if err := t.deserializeMVCCValueAndAppend(value); err != nil {
return err
}
return nil
}
// MergeOlder deserializes the value and appends it to the slice corresponding to its type
// (timeseries or non-timeseries). The slice will be reversed if needed such that it is in
// new-to-old order.
func (t *MVCCValueMerger) MergeOlder(value []byte) error {
if t.inPool {
return errors.AssertionFailedf("MVCCValueMerger used after being returned to pool")
}
t.ensureOrder(false /* oldToNew */)
if err := t.deserializeMVCCValueAndAppend(value); err != nil {
return err
}
return nil
}
// Finish combines the buffered values from all `Merge*()` calls and marshals the result.
// In the case of non-timeseries values, the operands are simply concatenated from old
// to new. In the case of timeseries values, they are sorted, deduplicated, and
// potentially migrated to columnar format. When deduplicating, only the latest sample
// for a given offset is retained.
func (t *MVCCValueMerger) Finish(includesBase bool) ([]byte, io.Closer, error) {
if t.inPool {
return nil, nil, errors.AssertionFailedf("MVCCValueMerger used after being returned to pool")
}
defer t.returnToPool()
isColumnar := false
// NB: use length checks rather than nil checks so that a merger reused from
// the pool (whose slices are reset to length zero but retain capacity) is
// handled correctly.
if len(t.timeSeriesOps) == 0 && len(t.rawByteOps) == 0 {
return nil, nil, errors.Errorf("empty merge unsupported")
}
t.ensureOrder(true /* oldToNew */)
if len(t.timeSeriesOps) == 0 {
// Concatenate non-timeseries operands from old to new
totalLen := 0
for _, rawByteOp := range t.rawByteOps {
totalLen += len(rawByteOp)
}
// See the motivating comment in mvcc.proto.
meta := &t.metaSubset // avoid allocation
meta.RawBytes = slices.Grow(meta.RawBytes[:0], mvccHeaderSize+totalLen)[:mvccHeaderSize]
clear(meta.RawBytes)
meta.RawBytes[mvccTagPos] = byte(roachpb.ValueType_BYTES)
for _, rawByteOp := range t.rawByteOps {
meta.RawBytes = append(meta.RawBytes, rawByteOp...)
}
res, err := protoutil.Marshal(meta)
if err != nil {
return nil, nil, err
}
return res, nil, nil
}
// TODO(ajkr): confirm it is the case that (1) today's CRDB always merges timeseries
// values in columnar format, and (2) today's CRDB does not need to be downgrade-
// compatible with any version that supports row format only. Then we can drop support
// for row format entirely. It requires significant cleanup effort as many tests target
// the row format.
merged := &t.merged // avoid allocation, reuse slices.
merged.StartTimestampNanos = t.timeSeriesOps[0].StartTimestampNanos
merged.SampleDurationNanos = t.timeSeriesOps[0].SampleDurationNanos
for i := range t.timeSeriesOps {
timeSeriesOp := &t.timeSeriesOps[i]
if timeSeriesOp.StartTimestampNanos != merged.StartTimestampNanos {
return nil, nil, errors.Errorf("start timestamp mismatch")
}
if timeSeriesOp.SampleDurationNanos != merged.SampleDurationNanos {
return nil, nil, errors.Errorf("sample duration mismatch")
}
if !isColumnar && len(timeSeriesOp.Offset) > 0 {
ensureColumnar(merged)
ensureColumnar(timeSeriesOp)
isColumnar = true
} else if isColumnar {
ensureColumnar(timeSeriesOp)
}
proto.Merge(merged, timeSeriesOp)
}
if isColumnar {
sortAndDeduplicateColumns(merged)
} else {
sortAndDeduplicateRows(merged)
}
meta := &t.metaSubset // avoid allocation
rawBytesSize := mvccHeaderSize + merged.Size()
meta.RawBytes = slices.Grow(meta.RawBytes[:0], rawBytesSize)[:rawBytesSize]
meta.RawBytes[mvccTagPos] = byte(roachpb.ValueType_TIMESERIES)
// See the motivating comment in mvcc.proto.
if !(t.oldestMergeTS == hlc.LegacyTimestamp{}) {
meta.MergeTimestamp = &t.oldestMergeTS
}
_, err := merged.MarshalToSizedBuffer(meta.RawBytes[mvccHeaderSize:])
if err != nil {
return nil, nil, err
}
res, err := protoutil.Marshal(meta)
if err != nil {
return nil, nil, err
}
return res, nil, nil
}
func serializeMergeInputs(sources ...roachpb.InternalTimeSeriesData) ([][]byte, error) {
// Wrap each proto in an inlined MVCC value, and marshal each wrapped value
// to bytes. This is the format required by the engine.
srcBytes := make([][]byte, 0, len(sources))
var val roachpb.Value
for _, src := range sources {
if err := val.SetProto(&src); err != nil {
return nil, err
}
bytes, err := protoutil.Marshal(&enginepb.MVCCMetadata{
RawBytes: val.RawBytes,
})
if err != nil {
return nil, err
}
srcBytes = append(srcBytes, bytes)
}
return srcBytes, nil
}
func deserializeMergeOutput(mergedBytes []byte) (roachpb.InternalTimeSeriesData, error) {
// Unmarshal merged bytes and extract the time series value within.
var meta enginepb.MVCCMetadata
if err := protoutil.Unmarshal(mergedBytes, &meta); err != nil {
return roachpb.InternalTimeSeriesData{}, err
}
mergedTS, err := MakeValue(meta).GetTimeseries()
if err != nil {
return roachpb.InternalTimeSeriesData{}, err
}
return mergedTS, nil
}
// MergeInternalTimeSeriesData exports the engine's MVCC merge logic for
// InternalTimeSeriesData to higher level packages. This is intended primarily
// for consumption by high level testing of time series functionality.
// If usePartialMerge is true, the operands are merged together using a partial
// merge operation first, and are then merged in to the initial state.
func MergeInternalTimeSeriesData(
usePartialMerge bool, sources ...roachpb.InternalTimeSeriesData,
) (roachpb.InternalTimeSeriesData, error) {
// Merge every element into a nil byte slice, one at a time.
var mvccMerger MVCCValueMerger
srcBytes, err := serializeMergeInputs(sources...)
if err != nil {
return roachpb.InternalTimeSeriesData{}, err
}
for _, bytes := range srcBytes {
if err := mvccMerger.MergeNewer(bytes); err != nil {
return roachpb.InternalTimeSeriesData{}, err
}
}
resBytes, closer, err := mvccMerger.Finish(!usePartialMerge)
if err != nil {
return roachpb.InternalTimeSeriesData{}, err
}
res, err := deserializeMergeOutput(resBytes)
if closer != nil {
_ = closer.Close()
}
return res, err
}
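// A minimal usage sketch of MergeInternalTimeSeriesData (the numbers are
// illustrative, and it assumes the row-format Samples field of
// roachpb.InternalTimeSeriesData):
//
//	ts1 := roachpb.InternalTimeSeriesData{
//		StartTimestampNanos: 1_000_000_000,
//		SampleDurationNanos: 10_000_000_000,
//		Samples:             []roachpb.InternalTimeSeriesSample{{Offset: 0, Count: 1, Sum: 5}},
//	}
//	ts2 := roachpb.InternalTimeSeriesData{
//		StartTimestampNanos: 1_000_000_000,
//		SampleDurationNanos: 10_000_000_000,
//		Samples:             []roachpb.InternalTimeSeriesSample{{Offset: 3, Count: 1, Sum: 7}},
//	}
//	merged, err := MergeInternalTimeSeriesData(false /* usePartialMerge */, ts1, ts2)
//
// Since neither operand uses the columnar Offset field, the merged result
// stays in row format and contains both samples ordered by offset.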
// Copyright 2019 The Cockroach Authors.
//
// Use of this software is governed by the CockroachDB Software License
// included in the /LICENSE file.
package storage
import (
"bytes"
"context"
"encoding/binary"
"sort"
"sync"
"github.com/cockroachdb/cockroach/pkg/keys"
"github.com/cockroachdb/cockroach/pkg/kv/kvpb"
"github.com/cockroachdb/cockroach/pkg/kv/kvserver/uncertainty"
"github.com/cockroachdb/cockroach/pkg/roachpb"
"github.com/cockroachdb/cockroach/pkg/storage/enginepb"
"github.com/cockroachdb/cockroach/pkg/util"
"github.com/cockroachdb/cockroach/pkg/util/buildutil"
"github.com/cockroachdb/cockroach/pkg/util/hlc"
"github.com/cockroachdb/cockroach/pkg/util/metamorphic"
"github.com/cockroachdb/cockroach/pkg/util/mon"
"github.com/cockroachdb/cockroach/pkg/util/protoutil"
"github.com/cockroachdb/errors"
"github.com/cockroachdb/pebble"
)
var maxItersBeforeSeek = metamorphic.ConstantWithTestRange(
"mvcc-max-iters-before-seek",
10, /* defaultValue */
0, /* min */
3, /* max */
)
// MVCCDecodingStrategy controls if and how the fetcher should decode MVCC
// timestamps from returned KV's.
type MVCCDecodingStrategy int
const (
// MVCCDecodingNotRequired is used when timestamps aren't needed.
MVCCDecodingNotRequired MVCCDecodingStrategy = iota
// MVCCDecodingRequired is used when timestamps are needed.
MVCCDecodingRequired
)
// results abstracts away a result set that the pebbleMVCCScanner put()s KVs into.
type results interface {
// clear clears the results so that its memory could be GCed.
clear()
// sizeInfo returns several pieces of information about the current size of
// the results:
// - the number of KVs currently in the results,
// - the current memory footprint of the results in bytes,
// - the increment for how much the memory footprint of the results will
// increase (in bytes) if a KV (with the corresponding lengths of the key
// and the value parts) is put into it.
//
// Note that we chose to squash all these things into a single method rather
// than defining a separate method for each parameter out of performance
// considerations.
sizeInfo(lenKey, lenValue int) (numKeys, numBytes, numBytesInc int64)
// put adds a KV into the results. An error is returned if the memory
// reservation is denied by the memory account.
put(_ context.Context, mvccKey []byte, value []byte, memAccount *mon.BoundAccount, maxNewSize int) error
// continuesFirstRow returns true if the given key belongs to the same SQL
// row as the first KV pair in the result. If the given key is not a valid
// SQL row key, returns false.
//
// This method is called _after_ having called put() with no error at least
// once, meaning that at least one key is in the results.
//
// Only called when wholeRows option is enabled.
continuesFirstRow(key roachpb.Key) bool
// maybeTrimPartialLastRow removes the last KV pairs from the result that
// are part of the same SQL row as the given key, returning the earliest key
// removed.
//
// pebbleMVCCScanner.getOne can call this method only _before_ calling
// put(). This constraint ensures that for the singleResults implementation
// of this interface, when this method is called, there is no buffered KV
// (i.e. there is no KV that has been `put` into the results but not yet
// returned on the NextKVer.NextKV call). This allows for the singleResults
// to synchronize with the colfetcher.cFetcher (via the
// storage.FirstKeyOfRowGetter) to obtain the first key of the SQL row if
// the given key belongs to that row.
//
// Only called when wholeRows option is enabled.
maybeTrimPartialLastRow(key roachpb.Key) (roachpb.Key, error)
// lastRowHasFinalColumnFamily returns true if the last key in the result is
// the maximum column family ID of the row. If so, we know that the row is
// complete. However, the inverse is not true: the final column families of
// the row may be omitted, in which case the caller has to scan to the next
// key to find out whether the row is complete.
//
// This method is called _after_ having called put() with no error at least
// once, meaning that at least one key is in the results.
//
// Only called when wholeRows option is enabled.
lastRowHasFinalColumnFamily(reverse bool) bool
}
// pebbleResults stores MVCCScan / MVCCGet results in the same binary format as that
// expected by MVCCScanDecodeKeyValue.
type pebbleResults struct {
count int64
bytes int64
repr []byte
bufs [][]byte
// lastOffsets is a ring buffer that keeps track of byte offsets for the last
// N KV pairs. It is used to discard a partial SQL row at the end of the
// result via maybeTrimPartialLastRow() -- such rows can span multiple KV
// pairs. The length of lastOffsets is interpreted as the maximum expected SQL
// row size (i.e. number of column families).
//
// lastOffsets is initialized with a fixed length giving the N number of last
// KV pair offsets to track. lastOffsetIdx contains the index in lastOffsets
// where the next KV byte offset will be written, wrapping around to 0 when it
// reaches the end of lastOffsets.
//
// The lastOffsets values are byte offsets in p.repr and p.bufs. The latest
// lastOffset (i.e. the one at lastOffsetIdx-1) will be an offset in p.repr.
// When iterating backwards through the ring buffer and crossing a byte offset
// of 0, the next iterated byte offset in the ring buffer (at i-1) will then
// point to the previous buffer in p.bufs.
//
// Actual and default 0 values in the slice are disambiguated when iterating
// backwards through p.repr and p.bufs. If we iterate to the start of all byte
// buffers without iterating through all of lastOffsets (i.e. when there are
// fewer KV pairs than the length of lastOffsets), then we must be at the start
// of lastOffsets, and any 0 values at the end are of no interest.
lastOffsetsEnabled bool // NB: significantly faster than checking lastOffsets != nil
lastOffsets []int
lastOffsetIdx int
}
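// An illustrative example of the ring buffer above (assumed numbers): with
// len(lastOffsets) = 3 and four KVs put at byte offsets 0, 24, 48 and 72,
// lastOffsets ends up as [72, 24, 48] with lastOffsetIdx = 1. The most recent
// offset (72) therefore lives at index lastOffsetIdx-1 = 0, and the oldest
// tracked offset (24) is the one the next put() will overwrite.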
// clear implements the results interface.
func (p *pebbleResults) clear() {
*p = pebbleResults{}
}
// Key value lengths take up 8 bytes (2 x Uint32).
const pebbleResultsKVLenSize = 8
func pebbleResultsKVSizeOf(lenKey, lenValue int) int {
return pebbleResultsKVLenSize + lenKey + lenValue
}
// sizeInfo implements the results interface.
func (p *pebbleResults) sizeInfo(lenKey, lenValue int) (numKeys, numBytes, numBytesInc int64) {
numKeys = p.count
numBytes = p.bytes
numBytesInc = int64(pebbleResultsKVSizeOf(lenKey, lenValue))
return numKeys, numBytes, numBytesInc
}
// put implements the results interface.
//
// The repr that MVCCScan / MVCCGet expects to provide as output goes:
// <valueLen:Uint32><keyLen:Uint32><Key><Value>
// This function adds to repr in that format.
// - maxNewSize, if positive, indicates the maximum capacity for a new repr that
// can be allocated. It is assumed that maxNewSize (when positive) is sufficient
// for the new key-value pair.
func (p *pebbleResults) put(
ctx context.Context, key []byte, value []byte, memAccount *mon.BoundAccount, maxNewSize int,
) error {
const minSize = 16
const maxSize = 128 << 20 // 128 MB
// We maintain a list of buffers, always encoding into the last one (a.k.a.
// pebbleResults.repr). The size of the buffers is exponentially increasing,
// capped at maxSize. The exponential increase allows us to amortize the
// cost of the allocation over multiple put calls. If this (key, value) pair
// needs capacity greater than maxSize, we allocate exactly the size needed.
lenKey := len(key)
lenValue := len(value)
lenToAdd := pebbleResultsKVSizeOf(lenKey, lenValue)
if len(p.repr)+lenToAdd > cap(p.repr) {
// Exponential increase by default, while ensuring that we respect
// - a hard lower bound of lenToAdd
// - a soft upper bound of maxSize
// - a hard upper bound of maxNewSize (if set).
if maxNewSize > 0 && maxNewSize < lenToAdd {
// The hard upper bound is less than the hard lower bound - this is a
// violation of our assumptions.
return errors.AssertionFailedf("maxNewSize %dB is not sufficient, %dB required", maxNewSize, lenToAdd)
}
// Exponential growth to ensure newSize >= lenToAdd.
newSize := 2 * cap(p.repr)
if newSize == 0 || newSize > maxSize {
// If the previous buffer exceeded maxSize, we don't double its
// capacity for next allocation, and instead reset the exponential
// increase, in case we had a stray huge key-value.
newSize = minSize
}
for newSize < lenToAdd {
newSize *= 2
}
// Respect soft upper-bound before hard lower-bound, since it could be
// lower than hard lower-bound.
if newSize > maxSize {
newSize = maxSize
}
// Respect hard upper-bound.
if maxNewSize > 0 && newSize > maxNewSize {
newSize = maxNewSize
}
// Now respect hard lower-bound.
if newSize < lenToAdd {
newSize = lenToAdd
}
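// An illustrative example of the sizing policy (assumed numbers): with
// cap(p.repr) = 64, lenToAdd = 40 and maxNewSize = 100, doubling yields
// newSize = 128, which the maxNewSize cap then reduces to 100; since
// 100 >= lenToAdd, the hard lower bound is already satisfied.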
if len(p.repr) > 0 {
p.bufs = append(p.bufs, p.repr)
}
if err := memAccount.Grow(ctx, int64(newSize)); err != nil {
return err
}
p.repr = nonZeroingMakeByteSlice(newSize)[:0]
}
startIdx := len(p.repr)
p.repr = p.repr[:startIdx+lenToAdd]
binary.LittleEndian.PutUint32(p.repr[startIdx:], uint32(lenValue))
binary.LittleEndian.PutUint32(p.repr[startIdx+4:], uint32(lenKey))
copy(p.repr[startIdx+pebbleResultsKVLenSize:], key)
copy(p.repr[startIdx+pebbleResultsKVLenSize+lenKey:], value)
p.count++
p.bytes += int64(lenToAdd)
// If we're tracking KV offsets, update the ring buffer.
if p.lastOffsetsEnabled {
p.lastOffsets[p.lastOffsetIdx] = startIdx
p.lastOffsetIdx++
// NB: Branching is significantly faster than modulo in benchmarks, likely
// because of a high branch prediction hit rate.
if p.lastOffsetIdx == len(p.lastOffsets) {
p.lastOffsetIdx = 0
}
}
return nil
}
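// A sketch of how a finished result buffer can be walked back out of the
// <valueLen><keyLen><Key><Value> layout produced by put() (illustrative only;
// the format is the one consumed by MVCCScanDecodeKeyValue, per the comment on
// pebbleResults). Note that the key portion is an encoded engine key that
// still carries the MVCC timestamp suffix, as extractResultKey below shows.
//
//	for _, buf := range p.finish() {
//		for len(buf) > 0 {
//			valLen := binary.LittleEndian.Uint32(buf[0:4])
//			keyLen := binary.LittleEndian.Uint32(buf[4:8])
//			rawKey := buf[pebbleResultsKVLenSize : pebbleResultsKVLenSize+keyLen]
//			value := buf[pebbleResultsKVLenSize+keyLen : pebbleResultsKVLenSize+keyLen+valLen]
//			_, _ = rawKey, value // process the pair
//			buf = buf[pebbleResultsKVLenSize+keyLen+valLen:]
//		}
//	}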
// continuesFirstRow implements the results interface.
func (p *pebbleResults) continuesFirstRow(key roachpb.Key) bool {
repr := p.repr
if len(p.bufs) > 0 {
repr = p.bufs[0]
}
if len(repr) == 0 {
return true // no rows in the result
}
rowPrefix := getRowPrefix(key)
if rowPrefix == nil {
return false
}
return bytes.Equal(rowPrefix, getRowPrefix(extractResultKey(repr)))
}
// keyHasFinalColumnFamily returns whether the given key corresponds to the last
// column family in a SQL row. Returns false if the key is not a valid SQL key.
func keyHasFinalColumnFamily(key roachpb.Key, maxFamilyID uint32, reverse bool) bool {
colFamilyID, err := keys.DecodeFamilyKey(key)
if err != nil {
return false
}
if reverse {
return colFamilyID == 0
}
return colFamilyID == maxFamilyID
}
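// For example (illustrative): for a row whose largest column family ID is 2,
// a key whose family suffix decodes to 2 ends the row on a forward scan
// (colFamilyID == maxFamilyID), while on a reverse scan the last key
// encountered for the row is the one with family ID 0.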
// lastRowHasFinalColumnFamily implements the results interface.
func (p *pebbleResults) lastRowHasFinalColumnFamily(reverse bool) bool {
lastOffsetIdx := p.lastOffsetIdx - 1 // p.lastOffsetIdx is where next offset would be stored
if lastOffsetIdx < 0 {
lastOffsetIdx = len(p.lastOffsets) - 1
}
lastOffset := p.lastOffsets[lastOffsetIdx]
key := extractResultKey(p.repr[lastOffset:])
return keyHasFinalColumnFamily(key, uint32(len(p.lastOffsets)-1), reverse)
}
// maybeTrimPartialLastRow implements the results interface.
//
// The row cannot be made up of more KV pairs than given by len(lastOffsets),
// otherwise an error is returned. Must be called before finish().
func (p *pebbleResults) maybeTrimPartialLastRow(nextKey roachpb.Key) (roachpb.Key, error) {
if !p.lastOffsetsEnabled || len(p.repr) == 0 {
return nil, nil
}
trimRowPrefix := getRowPrefix(nextKey)
if trimRowPrefix == nil {
return nil, nil
}
var firstTrimmedKey roachpb.Key
// We're iterating backwards through the p.lastOffsets ring buffer, starting
// at p.lastOffsetIdx-1 (which is where the last KV was stored). The loop
// condition simply makes sure we limit the number of iterations to the size
// of the ring buffer, to prevent wrapping around.
for i := 0; i < len(p.lastOffsets); i++ {
lastOffsetIdx := p.lastOffsetIdx - 1 // p.lastOffsetIdx is where next offset would be stored
if lastOffsetIdx < 0 {
lastOffsetIdx = len(p.lastOffsets) - 1
}
lastOffset := p.lastOffsets[lastOffsetIdx]
// The remainder of repr from the offset is now a single KV.
repr := p.repr[lastOffset:]
key := extractResultKey(repr)
rowPrefix := getRowPrefix(key)
// If the prefix belongs to a different row, we're done trimming.
if !bytes.Equal(rowPrefix, trimRowPrefix) {
return firstTrimmedKey, nil
}
// Remove this KV pair.
p.repr = p.repr[:lastOffset]
p.count--
p.bytes -= int64(len(repr))
firstTrimmedKey = key
p.lastOffsetIdx = lastOffsetIdx
p.lastOffsets[lastOffsetIdx] = 0
if len(p.repr) == 0 {
if len(p.bufs) == 0 {
// The entire result set was trimmed, so we're done.
return firstTrimmedKey, nil
}
// Pop the last buf back into repr.
p.repr = p.bufs[len(p.bufs)-1]
p.bufs = p.bufs[:len(p.bufs)-1]
}
}
return nil, errors.Errorf("row exceeds expected max size (%d): %s", len(p.lastOffsets), nextKey)
}
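// For example (illustrative): if the result currently ends with two KVs from
// row A followed by a single KV from row B, and maybeTrimPartialLastRow is
// called with another key from row B, only the row-B KV is removed and
// returned as firstTrimmedKey; the two row-A KVs remain in the result.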
func (p *pebbleResults) finish() [][]byte {
if len(p.repr) > 0 {
p.bufs = append(p.bufs, p.repr)
p.repr = nil
}
return p.bufs
}
// getRowPrefix decodes a SQL row prefix from the given key. Returns nil if the
// key is not a valid SQL row, or if the prefix is the entire key.
func getRowPrefix(key roachpb.Key) []byte {
if len(key) == 0 {
return nil
}
n, err := keys.GetRowPrefixLength(key)
if err != nil || n <= 0 || n >= len(key) {
return nil
}
return key[:n]
}
// extractResultKey takes in a binary KV result representation, finds the raw
// key, decodes it as an MVCC key, and returns the key (without timestamp).
// Returns nil if the key could not be decoded. repr must be a valid, non-empty
// KV representation, otherwise this may panic.
func extractResultKey(repr []byte) roachpb.Key {
keyLen := binary.LittleEndian.Uint32(repr[4:8])
key, ok := DecodeEngineKey(repr[8 : 8+keyLen])
if !ok {
return nil
}
return key.Key
}
// pebbleMVCCScanner handles MVCCScan / MVCCGet using a Pebble iterator.
type pebbleMVCCScanner struct {
parent MVCCIterator
// memAccount is used to account for the size of the scan results.
memAccount *mon.BoundAccount
// unlimitedMemAcc will back the memAccount field above when the scanner is
// retrieved from its pool. The account is cleared as the scanner returns to
// the pool (see release); it's fine to "leak" the account if the scanner is
// not returned to the pool, since it's an unlimited account.
// When a custom mem account should be used instead, memAccount should be
// overridden.
unlimitedMemAcc mon.BoundAccount
// lockTable is used to determine whether keys are locked in the in-memory
// lock table when scanning with the skipLocked option.
lockTable LockTableView
reverse bool
peeked bool
// Iteration bounds. Does not contain MVCC timestamp.
start, end roachpb.Key
// Timestamp with which MVCCScan/MVCCGet was called.
ts hlc.Timestamp
// Max number of keys to return.
maxKeys int64
// Stop adding keys once p.result.bytes matches or exceeds this threshold,
// if nonzero.
targetBytes int64
// If true, return an empty result if the first result exceeds targetBytes.
allowEmpty bool
// If set, don't return partial SQL rows (spanning multiple KV pairs) when
// hitting a limit. Partial rows at the end of the result will be trimmed. If
// allowEmpty is false, and the partial row is the first row in the result,
// the row will instead be completed by fetching additional KV pairs.
wholeRows bool
// decodeMVCCHeaders is set by callers who expect to be able
// to read the full MVCCValueHeader off of
// curUnsafeValue. Used by mvccGet.
decodeMVCCHeaders bool
// Stop adding intents and abort scan once maxLockConflicts threshold is
// reached. This limit is only applicable to consistent scans since they
// return intents as an error.
// Not used in inconsistent scans.
// Ignored if zero.
maxLockConflicts int64
// Resume fields describe the resume span to return. resumeReason must be set
// to a non-zero value to return a resume span, the others are optional.
resumeReason kvpb.ResumeReason
resumeKey roachpb.Key // if unset, falls back to p.advanceKey()
resumeNextBytes int64 // set when targetBytes is exceeded
// Transaction epoch and sequence number.
txn *roachpb.Transaction
txnEpoch enginepb.TxnEpoch
txnSequence enginepb.TxnSeq
txnIgnoredSeqNums []enginepb.IgnoredSeqNumRange
// Uncertainty related fields.
uncertainty uncertainty.Interval
checkUncertainty bool
// Metadata object for unmarshalling intents.
meta enginepb.MVCCMetadata
// Bools copied over from MVCC{Scan,Get}Options. See the comment on the
// package level MVCCScan for what these mean.
inconsistent bool
skipLocked bool
tombstones bool
// rawMVCCValues instructs the scanner to return the full
// extended encoding of any returned value. This includes the
// MVCCValueHeader.
rawMVCCValues bool
failOnMoreRecent bool
keyBuf []byte
savedBuf []byte
lazyFetcherBuf pebble.LazyFetcher
lazyValueBuf []byte
// cur* variables store the "current" record we're pointing to. Updated in
// updateCurrent. Note that the timestamp can be clobbered in the case of
// adding an intent from the intent history but is otherwise meaningful.
curUnsafeKey MVCCKey
curRawKey []byte
curUnsafeValue MVCCValue
curRawValue pebble.LazyValue
curRangeKeys MVCCRangeKeyStack
savedRangeKeys MVCCRangeKeyStack
savedRangeKeyVers MVCCRangeKeyVersion
results results
intents pebble.Batch
// mostRecentTS stores the largest timestamp observed that is equal to or
// above the scan timestamp. Only applicable if failOnMoreRecent is true. If
// set and no other error is hit, a WriteToOld error will be returned from
// the scan. mostRecentKey is one of the keys (not necessarily at
// mostRecentTS) that was more recent than the scan.
mostRecentTS hlc.Timestamp
mostRecentKey roachpb.Key
// Stores any error returned. If non-nil, iteration short circuits.
err error
// Number of iterations to try before we do a Seek/SeekReverse. Stays within
// [0, maxItersBeforeSeek] and defaults to maxItersBeforeSeek/2.
itersBeforeSeek int
// machine is the state machine for how the iterator should be advanced in
// order to handle scans and reverse scans.
machine struct {
// fn indicates the advance function that needs to be called next.
fn advanceFn
// origKey is a temporary buffer used to store the "original key" when
// advancing the iterator at the new key. It is backed by keyBuf.
origKey []byte
}
// alloc holds fields embedded within the scanner struct only to reduce
// allocations in common cases.
alloc struct {
// Typically pebbleMVCCScanner.results points to pebbleResults.
// Embedding the pebbleResults within the pebbleMVCCScanner avoids an
// extra allocation, at the cost of higher allocated bytes when we use a
// different implementation of the results interface.
pebbleResults pebbleResults
}
}
type advanceFn int
const (
_ advanceFn = iota
// "Forward" advance states, used for non-reverse scans.
//
// advanceKeyForward indicates that the iterator needs to be advanced to the
// next key.
advanceKeyForward
// advanceKeyAtEndForward indicates that the iterator has reached the end in
// the forward direction.
advanceKeyAtEndForward
// advanceKeyAtNewKeyForward indicates that the iterator has just reached a
// new key.
advanceKeyAtNewKeyForward
// "Reverse" advance states, used for reverse scans.
//
// advanceKeyReverse indicates that the iterator needs to be advanced to the
// previous key.
advanceKeyReverse
// advanceKeyAtEndReverse indicates that the iterator has reached the end in
// the reverse direction.
advanceKeyAtEndReverse
// advanceKeyAtNewKeyReverse indicates that the iterator needs to be
// advanced to the key before the key that has just been reached.
advanceKeyAtNewKeyReverse
)
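// An illustrative (non-exhaustive) sketch of how these states are used: after
// seekToStartOfScan the machine starts in advanceKeyForward (or
// advanceKeyReverse for reverse scans) and normally stays there, advancing one
// user key per getOne call. When seekVersion moves the iterator past the
// current user key it switches the machine to advanceKeyAtNewKey{Forward,
// Reverse} via setAdvanceKeyAtNewKey, and when it exhausts the iterator it
// switches to advanceKeyAtEnd{Forward,Reverse} via setAdvanceKeyAtEnd, so that
// the next advance() call can reposition correctly.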
// Pool for allocating pebble MVCC Scanners.
var pebbleMVCCScannerPool = sync.Pool{
New: func() interface{} {
mvccScanner := &pebbleMVCCScanner{
unlimitedMemAcc: *mon.NewStandaloneUnlimitedAccount(),
}
mvccScanner.memAccount = &mvccScanner.unlimitedMemAcc
return mvccScanner
},
}
func (p *pebbleMVCCScanner) release() {
// Release all bytes from the unlimited memory account (but keep
// the account intact).
p.unlimitedMemAcc.Empty(context.Background())
// Discard most memory references before placing in pool.
*p = pebbleMVCCScanner{
keyBuf: p.keyBuf,
memAccount: &p.unlimitedMemAcc,
unlimitedMemAcc: p.unlimitedMemAcc,
// NB: This clears p.alloc.pebbleResults too, which should be maintained
// to avoid delaying GC of contained byte slices and avoid accidental
// misuse.
}
pebbleMVCCScannerPool.Put(p)
}
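// A minimal sketch of the pool lifecycle (illustrative; iter, startKey,
// endKey, readTimestamp, txn, uncertaintyInterval and ctx are placeholders,
// and real callers configure many more of the fields above before scanning):
//
//	p := pebbleMVCCScannerPool.Get().(*pebbleMVCCScanner)
//	defer p.release()
//	p.parent = iter // the MVCCIterator to read from
//	p.start, p.end = startKey, endKey
//	p.ts = readTimestamp
//	p.init(txn, uncertaintyInterval, &p.alloc.pebbleResults)
//	resumeSpan, resumeReason, resumeNextBytes, err := p.scan(ctx)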
// init sets bounds on the underlying pebble iterator, and initializes other
// fields not set by the calling method.
func (p *pebbleMVCCScanner) init(
txn *roachpb.Transaction, ui uncertainty.Interval, results results,
) {
p.itersBeforeSeek = maxItersBeforeSeek / 2
p.results = results
if txn != nil {
p.txn = txn
p.txnEpoch = txn.Epoch
p.txnSequence = txn.Sequence
p.txnIgnoredSeqNums = txn.IgnoredSeqNums
}
p.uncertainty = ui
// We must check uncertainty even if p.ts >= local_uncertainty_limit
// because the local uncertainty limit cannot be applied to values with
// future-time timestamps with earlier local timestamps. We are only able
// to skip uncertainty checks if p.ts >= global_uncertainty_limit.
//
// We disable checkUncertainty when the scanner is configured with failOnMoreRecent.
// This avoids cases in which a scan would have failed with a WriteTooOldError
// but instead gets an unexpected ReadWithinUncertaintyIntervalError.
// See:
// https://github.com/cockroachdb/cockroach/issues/119681
p.checkUncertainty = p.ts.Less(p.uncertainty.GlobalLimit) && !p.failOnMoreRecent
}
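// For example (illustrative timestamps): with p.ts = 10 and
// p.uncertainty.GlobalLimit = 15, checkUncertainty is true (assuming
// failOnMoreRecent is unset), since a committed value at, say, ts = 12 could
// still be uncertain; with p.ts = 20 it is false, because no value above the
// read timestamp can fall within the uncertainty interval.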
// get seeks to the start key exactly once and adds one KV to the result set.
func (p *pebbleMVCCScanner) get(ctx context.Context) {
p.parent.SeekGE(MVCCKey{Key: p.start})
if !p.iterValid() {
return
}
// Unlike scans, if tombstones are enabled, we synthesize point tombstones
// for MVCC range tombstones even if there is no existing point key below
// it. These are often needed for e.g. conflict checks. However, both
// processRangeKeys and getOne may need to advance the iterator,
// moving away from range key we originally landed on. If we're in tombstone
// mode and there's a range key, save the most recent visible value so that
// we can use it to synthesize a tombstone if we fail to find a KV.
var hadMVCCRangeTombstone bool
if p.tombstones {
if _, hasRange := p.parent.HasPointAndRange(); hasRange {
rangeKeys := p.parent.RangeKeys()
if rkv, ok := rangeKeys.FirstAtOrBelow(p.ts); ok {
hadMVCCRangeTombstone = true
rkv.CloneInto(&p.savedRangeKeyVers)
}
}
}
var added bool
if p.processRangeKeys(true /* seeked */, false /* reverse */) {
if p.updateCurrent() {
_, added = p.getOne(ctx)
}
}
p.maybeFailOnMoreRecent()
// In tombstone mode, if no point key was added but we saved a visible range
// key version above, synthesize a point tombstone for it here. See the
// comment above for why this is needed (e.g. for conflict checks).
if p.tombstones && hadMVCCRangeTombstone && !added && p.err == nil {
p.addSynthetic(ctx, p.start, p.savedRangeKeyVers)
}
}
// seekToStartOfScan positions the scanner at the initial key.
func (p *pebbleMVCCScanner) seekToStartOfScan() (ok bool) {
if p.reverse {
if !p.iterSeekReverse(MVCCKey{Key: p.end}) {
p.maybeFailOnMoreRecent() // may have seen a conflicting range key
return false
}
p.machine.fn = advanceKeyReverse
} else {
if !p.iterSeek(MVCCKey{Key: p.start}) {
p.maybeFailOnMoreRecent() // may have seen a conflicting range key
return false
}
p.machine.fn = advanceKeyForward
}
return true
}
// advance advances the iterator according to the current state of the state
// machine.
func (p *pebbleMVCCScanner) advance() bool {
switch p.machine.fn {
case advanceKeyForward:
return p.advanceKeyForward()
case advanceKeyAtEndForward:
// We've reached the end of the iterator and there is
// nothing left to do.
return false
case advanceKeyAtNewKeyForward:
// We're already at the new key so there is nothing to do.
p.machine.fn = advanceKeyForward
return true
case advanceKeyReverse:
return p.advanceKeyReverse()
case advanceKeyAtEndReverse:
p.machine.fn = advanceKeyReverse
return p.advanceKeyAtEndReverse()
case advanceKeyAtNewKeyReverse:
p.machine.fn = advanceKeyReverse
return p.advanceKeyAtNewKeyReverse()
default:
p.err = errors.AssertionFailedf("unexpected advanceFn: %d", p.machine.fn)
return false
}
}
// scan iterates until a limit is exceeded, the underlying iterator is
// exhausted, or an error is encountered. If a limit was exceeded, it returns a
// resume span, resume reason, and for targetBytes the size of the next result.
func (p *pebbleMVCCScanner) scan(
ctx context.Context,
) (*roachpb.Span, kvpb.ResumeReason, int64, error) {
if p.wholeRows && !p.results.(*pebbleResults).lastOffsetsEnabled {
return nil, 0, 0, errors.AssertionFailedf("cannot use wholeRows without trackLastOffsets")
}
if !p.seekToStartOfScan() {
return nil, 0, 0, p.err
}
for ok := true; ok; {
ok, _ = p.getOne(ctx)
if ok {
ok = p.advance()
}
}
return p.afterScan()
}
// afterScan checks whether some limit was exceeded during the scan, and if so,
// it returns a resume span, resume reason, and for targetBytes the size of the
// next result.
func (p *pebbleMVCCScanner) afterScan() (*roachpb.Span, kvpb.ResumeReason, int64, error) {
p.maybeFailOnMoreRecent()
if p.err != nil {
return nil, 0, 0, p.err
}
if p.resumeReason != 0 {
resumeKey := p.resumeKey
if len(resumeKey) == 0 {
if p.reverse {
if !p.advanceKeyReverse() {
return nil, 0, 0, nil // nothing to resume
}
} else {
if !p.advanceKeyForward() {
return nil, 0, 0, nil // nothing to resume
}
}
resumeKey = p.curUnsafeKey.Key
}
var resumeSpan *roachpb.Span
if p.reverse {
// NB: this is equivalent to:
// append(roachpb.Key(nil), resumeKey...).Next()
// but with half the allocations.
resumeKeyCopy := make(roachpb.Key, len(resumeKey), len(resumeKey)+1)
copy(resumeKeyCopy, resumeKey)
resumeSpan = &roachpb.Span{
Key: p.start,
EndKey: resumeKeyCopy.Next(),
}
} else {
resumeSpan = &roachpb.Span{
Key: append(roachpb.Key(nil), resumeKey...),
EndKey: p.end,
}
}
return resumeSpan, p.resumeReason, p.resumeNextBytes, nil
}
return nil, 0, 0, nil
}
// Increments itersBeforeSeek while ensuring it stays <= maxItersBeforeSeek.
func (p *pebbleMVCCScanner) incrementItersBeforeSeek() {
p.itersBeforeSeek++
if p.itersBeforeSeek > maxItersBeforeSeek {
p.itersBeforeSeek = maxItersBeforeSeek
}
}
// Decrements itersBeforeSeek while ensuring it stays positive.
func (p *pebbleMVCCScanner) decrementItersBeforeSeek() {
p.itersBeforeSeek--
if p.itersBeforeSeek < 1 {
if maxItersBeforeSeek > 0 {
p.itersBeforeSeek = 1
} else if p.itersBeforeSeek < 0 {
// maxItersBeforeSeek == 0 && p.itersBeforeSeek < 0.
p.itersBeforeSeek = 0
}
}
}
// Try to read from the current value's intent history. Assumes p.meta has been
// unmarshalled already. Returns found = true if a value was found and returned.
func (p *pebbleMVCCScanner) getFromIntentHistory() (value []byte, found bool) {
intentHistory := p.meta.IntentHistory
// upIdx is the index of the first intent in intentHistory with a sequence
// number greater than our transaction's sequence number. Subtract 1 from it
// to get the index of the intent with the highest sequence number that is
// still less than or equal to p.txnSequence.
upIdx := sort.Search(len(intentHistory), func(i int) bool {
return intentHistory[i].Sequence > p.txnSequence
})
// If the candidate intent has a sequence number that is ignored by this txn,
// iterate backward along the sorted intent history until we come across an
// intent which isn't ignored.
//
// TODO(itsbilal): Explore if this iteration can be improved through binary
// search.
for upIdx > 0 && enginepb.TxnSeqIsIgnored(p.meta.IntentHistory[upIdx-1].Sequence, p.txnIgnoredSeqNums) {
upIdx--
}
if upIdx == 0 {
// It is possible that no intent exists such that the sequence is less
// than the read sequence, and is not ignored by this transaction.
// In this case, we cannot read a value from the intent history.
return nil, false
}
intent := &p.meta.IntentHistory[upIdx-1]
return intent.Value, true
}
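// For example (illustrative sequence numbers): with an intent history holding
// sequences [2, 4, 6] and p.txnSequence = 5, sort.Search returns upIdx = 2
// (sequence 6 is the first entry above 5), so the value written at sequence 4
// is returned. If sequence 4 is in p.txnIgnoredSeqNums, the loop steps back
// and the value written at sequence 2 is returned instead.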
// Returns a write too old error if an error is not already set on the scanner
// and a more recent value was found during the scan.
func (p *pebbleMVCCScanner) maybeFailOnMoreRecent() {
if p.err != nil || p.mostRecentTS.IsEmpty() {
return
}
// The txn can't write at the existing timestamp, so we provide the error
// with the timestamp immediately after it.
p.err = kvpb.NewWriteTooOldError(p.ts, p.mostRecentTS.Next(), p.mostRecentKey)
p.results.clear()
p.intents.Reset()
}
// Returns an uncertainty error with the specified value and local timestamps,
// along with context about the reader.
func (p *pebbleMVCCScanner) uncertaintyError(
valueTs hlc.Timestamp, localTs hlc.ClockTimestamp,
) (ok bool) {
p.err = kvpb.NewReadWithinUncertaintyIntervalError(
p.ts, p.uncertainty.LocalLimit, p.txn, valueTs, localTs)
p.results.clear()
p.intents.Reset()
return false
}
// Get one tuple into the result set. This method will make at most one
// 'results.put' call regardless of whether 'put' returns an error or not.
// - ok indicates whether the iteration should continue.
// - added indicates whether a tuple was included into the result set.
// (ok=true, added=false) indicates that the current key was skipped for some
// reason, but the iteration should continue.
// (ok=false, added=true) indicates that the KV was included into the result but
// the iteration should stop.
//
// The scanner must be positioned on a point key, possibly with an overlapping
// range key. Range keys are processed separately in processRangeKeys().
func (p *pebbleMVCCScanner) getOne(ctx context.Context) (ok, added bool) {
if !p.curUnsafeKey.Timestamp.IsEmpty() {
// Range key where read ts >= range key ts >= point key ts. Synthesize a
// point tombstone for it. Range key conflict checks are done in
// processRangeKeys().
if rkv, ok := p.coveredByRangeKey(p.curUnsafeKey.Timestamp); ok {
return p.addSynthetic(ctx, p.curUnsafeKey.Key, rkv)
}
// We are eagerly fetching and decoding the value, even though it may be
// too recent. With some care, this could be optimized to be lazy.
v, valid := p.getFromLazyValue()
if !valid {
return false, false
}
uncertaintyCheckRequired := p.checkUncertainty && !p.curUnsafeKey.Timestamp.LessEq(p.ts)
if !p.mvccHeaderRequired(uncertaintyCheckRequired) {
if !p.decodeCurrentValueIgnoringHeader(v) {
return false, false
}
} else if extended, valid := p.tryDecodeCurrentValueSimple(v); !valid {
return false, false
} else if extended {
if !p.decodeCurrentValueExtended(v) {
return false, false
}
}
// ts < read_ts
if p.curUnsafeKey.Timestamp.Less(p.ts) {
// 1. Fast path: there is no intent and our read timestamp is newer
// than the most recent version's timestamp.
return p.add(ctx, p.curUnsafeKey.Key, p.curRawKey, p.curUnsafeValue.Value.RawBytes, v)
}
// ts == read_ts
if p.curUnsafeKey.Timestamp == p.ts {
if p.failOnMoreRecent {
// 2. Our txn's read timestamp is equal to the most recent
// version's timestamp and the scanner has been configured to
// throw a write too old error on equal or more recent versions.
if p.skipLocked {
if locked, ok := p.isKeyLockedByConflictingTxn(ctx, p.curRawKey); !ok {
return false, false
} else if locked {
// 2a. the scanner was configured to skip locked keys, and
// this key was locked, so we can advance past it without
// raising the write too old error.
return true /* ok */, false
}
}
// 2b. We need to raise a write too old error. Merge the current
// timestamp with the maximum timestamp we've seen so we know to
// return an error, but then keep scanning so that we can return
// the largest possible time.
if p.mostRecentTS.Forward(p.curUnsafeKey.Timestamp) {
p.mostRecentKey = append(p.mostRecentKey[:0], p.curUnsafeKey.Key...)
}
return true /* ok */, false
}
// 3. There is no intent and our read timestamp is equal to the most
// recent version's timestamp.
return p.add(ctx, p.curUnsafeKey.Key, p.curRawKey, p.curUnsafeValue.Value.RawBytes, v)
}
// ts > read_ts
if p.failOnMoreRecent {
// 4. Our txn's read timestamp is less than the most recent
// version's timestamp and the scanner has been configured to
// throw a write too old error on equal or more recent versions.
if p.skipLocked {
if locked, ok := p.isKeyLockedByConflictingTxn(ctx, p.curRawKey); !ok {
return false, false
} else if locked {
// 4a. the scanner was configured to skip locked keys, and
// this key was locked, so we can advance past it without
// raising the write too old error.
return true /* ok */, false
}
}
// 4b. We need to raise a write too old error. Merge the current
// timestamp with the maximum timestamp we've seen so we know to
// return an error, but then keep scanning so that we can return
// the largest possible time.
if p.mostRecentTS.Forward(p.curUnsafeKey.Timestamp) {
p.mostRecentKey = append(p.mostRecentKey[:0], p.curUnsafeKey.Key...)
}
return true /* ok */, false
}
if p.checkUncertainty {
// 5. Our txn's read timestamp is less than the max timestamp
// seen by the txn. We need to check for clock uncertainty
// errors.
localTS := p.curUnsafeValue.GetLocalTimestamp(p.curUnsafeKey.Timestamp)
if p.uncertainty.IsUncertain(p.curUnsafeKey.Timestamp, localTS) {
return p.uncertaintyError(p.curUnsafeKey.Timestamp, localTS), false
}
// This value is not within the reader's uncertainty window, but
// there could be other uncertain committed values, so seek and
// check uncertainty using the uncertainty interval's GlobalLimit.
return p.seekVersion(ctx, p.uncertainty.GlobalLimit, true)
}
// 6. Our txn's read timestamp is greater than or equal to the
// max timestamp seen by the txn so clock uncertainty checks are
// unnecessary. We need to seek to the desired version of the
// value (i.e. one with a timestamp earlier than our read
// timestamp).
return p.seekVersion(ctx, p.ts, false)
}
if !p.decodeCurrentMetadata() {
return false, false
}
if len(p.meta.RawBytes) != 0 {
// 7. Emit immediately if the value is inline.
//
// TODO(ssd): We error if we find an inline when
// ReturnRawMVCCValues is set. Anyone scanning with
// that option set should not be encountering inline
// values.
//
// https://github.com/cockroachdb/cockroach/issues/131667
return p.add(ctx, p.curUnsafeKey.Key, p.curRawKey, p.meta.RawBytes, p.meta.RawBytes)
}
if p.meta.Txn == nil {
p.err = errors.Errorf("intent without transaction")
return false, false
}
metaTS := p.meta.Timestamp.ToTimestamp()
// metaTS is the timestamp of an intent value, which we may or may
// not end up ignoring, depending on factors codified below. If we do ignore
// the intent then we want to read at a lower timestamp that's strictly
// below the intent timestamp (to skip the intent), but also does not exceed
// our read timestamp (to avoid erroneously picking up future committed
// values); this timestamp is prevTS.
prevTS := p.ts
if metaTS.LessEq(p.ts) {
prevTS = metaTS.Prev()
}
ownIntent := p.txn != nil && p.meta.Txn.ID.Equal(p.txn.ID)
if !ownIntent {
conflictingIntent := metaTS.LessEq(p.ts) || p.failOnMoreRecent
if !conflictingIntent {
// 8. The key contains an intent, but we're reading below the intent.
// Seek to the desired version, checking for uncertainty if necessary.
//
// Note that if we own the intent (i.e. we're reading transactionally)
// we want to read the intent regardless of our read timestamp and fall
// into case 11 below.
if p.checkUncertainty {
// The intent's provisional value may be within the uncertainty window.
// Or there could be a different, uncertain committed value in the
// window. To detect either case, seek to and past the uncertainty
// interval's global limit and check uncertainty as we scan.
return p.seekVersion(ctx, p.uncertainty.GlobalLimit, true)
}
return p.seekVersion(ctx, p.ts, false)
}
if p.inconsistent {
// 9. The key contains an intent and we're doing an inconsistent
// read at a timestamp newer than the intent. We ignore the
// intent by insisting that the timestamp we're reading at is a
// historical timestamp < the intent timestamp. However, we
// return the intent separately; the caller may want to resolve
// it.
//
// p.intents is a pebble.Batch which grows its byte slice capacity in
// chunks to amortize allocations. The memMonitor is under-counting here
// by only accounting for the key and value bytes.
if !p.addCurIntent(ctx) {
return false, false
}
return p.seekVersion(ctx, prevTS, false)
}
if p.skipLocked {
// 10. The scanner has been configured with the skipLocked option. Ignore
// intents written by other transactions and seek to the next key.
// However, we return the intent separately if we have room; the caller
// may want to resolve it. Unlike below, this intent will not result in
// a LockConflictError because MVCC{Scan,Get}Options.errOnIntents returns
// false when skipLocked is enabled.
if p.maxLockConflicts == 0 || int64(p.intents.Count()) < p.maxLockConflicts {
if !p.addCurIntent(ctx) {
return false, false
}
}
return true /* ok */, false
}
// 11. The key contains an intent which was not written by our
// transaction and either:
// - our read timestamp is equal to or newer than that of the
// intent
// - our read timestamp is older than that of the intent but
// the intent is in our transaction's uncertainty interval
// - our read timestamp is older than that of the intent but
// we want to fail on more recent writes
// Note that this will trigger an error higher up the stack. We
// continue scanning so that we can return all of the intents
// in the scan range.
if !p.addCurIntent(ctx) {
return false, false
}
// Limit number of intents returned in lock conflict error.
if p.maxLockConflicts > 0 && int64(p.intents.Count()) >= p.maxLockConflicts {
p.resumeReason = kvpb.RESUME_INTENT_LIMIT
return false, false
}
return true /* ok */, false
}
if p.txnEpoch == p.meta.Txn.Epoch {
if p.txnSequence >= p.meta.Txn.Sequence && !enginepb.TxnSeqIsIgnored(p.meta.Txn.Sequence, p.txnIgnoredSeqNums) {
// 12. We're reading our own txn's intent at an equal or higher sequence.
// Note that we read at the intent timestamp, not at our read timestamp
// as the intent timestamp may have been pushed forward by another
// transaction. Txns always need to read their own writes.
return p.seekVersion(ctx, metaTS, false)
}
// 13. We're reading our own txn's intent at a lower sequence than is
// currently present in the intent. This means the intent we're seeing
// was written at a higher sequence than the read and that there may or
// may not be earlier versions of the intent (with lower sequence
// numbers) that we should read. If there exists a value in the intent
// history that has a sequence number equal to or less than the read
// sequence, read that value.
if intentValueRaw, found := p.getFromIntentHistory(); found {
// If we're adding a value due to a previous intent, we want to populate
// the timestamp as of the current metaTS. Note that this may be
// controversial, as metaTS is not necessarily the timestamp at which this
// intent value was originally written. However, this was previously the only
// case in which a value could be returned from a read without an MVCC
// timestamp.
//
// Note: this assumes that it is safe to corrupt curKey here because we're
// about to advance. If this proves to be a problem later, we can extend
// add to take an MVCCKey explicitly.
p.curUnsafeKey.Timestamp = metaTS
p.keyBuf = EncodeMVCCKeyToBuf(p.keyBuf[:0], p.curUnsafeKey)
p.curUnsafeValue, p.err = DecodeMVCCValue(intentValueRaw)
if p.err != nil {
return false, false
}
return p.add(ctx, p.curUnsafeKey.Key, p.keyBuf, p.curUnsafeValue.Value.RawBytes, intentValueRaw)
}
// 14. If no value in the intent history has a sequence number equal to
// or less than the read, we must ignore the intents laid down by the
// transaction all together. We ignore the intent by insisting that the
// timestamp we're reading at is a historical timestamp < the intent
// timestamp.
return p.seekVersion(ctx, prevTS, false)
}
if p.txnEpoch < p.meta.Txn.Epoch {
// 15. We're reading our own txn's intent but the current txn has
// an earlier epoch than the intent. Return an error so that the
// earlier incarnation of our transaction aborts (presumably
// this is some operation that was retried).
p.err = errors.Errorf("failed to read with epoch %d due to a write intent with epoch %d",
p.txnEpoch, p.meta.Txn.Epoch)
return false, false
}
// 16. We're reading our own txn's intent but the current txn has a
// later epoch than the intent. This can happen if the txn was
// restarted and an earlier iteration wrote the value we're now
// reading. In this case, we ignore the intent and read the
// previous value as if the transaction were starting fresh.
return p.seekVersion(ctx, prevTS, false)
}
// nextKey advances to the next user key.
func (p *pebbleMVCCScanner) nextKey() bool {
if p.reverse && p.peeked {
// If the parent iterator is in reverse because we've peeked, then we
// can step the iterator once to land back onto the current key before
// we fallthrough to call NextKey.
if !p.iterNext() {
return false
}
// Fallthrough to NextKey.
}
p.parent.NextKey()
if !p.iterValid() {
return false
}
if !p.processRangeKeys(false /* seeked */, false /* reverse */) {
return false
}
return p.updateCurrent()
}
// backwardLatestVersion backs up the iterator to the latest version for the
// specified key. The parameter i is used to maintain iteration count between
// the loop here and the caller (usually prevKey). Returns false if the
// iterator was exhausted. Assumes that the iterator is currently positioned at
// the oldest version of key.
func (p *pebbleMVCCScanner) backwardLatestVersion(key []byte, i int) bool {
p.keyBuf = append(p.keyBuf[:0], key...)
for ; i < p.itersBeforeSeek; i++ {
peekedKey, hasPoint, ok := p.iterPeekPrev()
if !ok {
// No previous entry exists, so we're at the latest version of key.
return true
}
// We may peek a bare range key with the same start bound as the point key,
// in which case we're also positioned on the latest point key version.
if !bytes.Equal(peekedKey, p.keyBuf) || !hasPoint {
p.incrementItersBeforeSeek()
return true
}
if !p.iterPrev() {
return false
}
}
// We're still not pointed to the latest version of the key. Fall back to
// seeking to the latest version. Note that we cannot rely on key being
// unchanged even though we are at a different version of the same key --
// the underlying MVCCIterator is free to mutate the backing for key
// arbitrarily. Therefore we use p.keyBuf here which we have handy.
p.decrementItersBeforeSeek()
return p.iterSeek(MVCCKey{Key: p.keyBuf})
}
// prevKey advances to the newest version of the user key preceding the
// specified key. Assumes that the iterator is currently positioned at key or 1
// record after key and that the key is "safe" (i.e. it's a copy and not the
// "current" key directly).
func (p *pebbleMVCCScanner) prevKey(key []byte) bool {
for i := 0; i < p.itersBeforeSeek; i++ {
if !p.iterPrev() {
return false
}
if !bytes.Equal(p.curUnsafeKey.Key, key) {
return p.backwardLatestVersion(p.curUnsafeKey.Key, i+1)
}
}
p.decrementItersBeforeSeek()
return p.iterSeekReverse(MVCCKey{Key: key})
}
// advanceKeyForward advances to the next key in the forward direction.
func (p *pebbleMVCCScanner) advanceKeyForward() bool {
return p.nextKey()
}
// advanceKeyReverse advances to the next key in the reverse direction.
func (p *pebbleMVCCScanner) advanceKeyReverse() bool {
// Make a copy to satisfy the contract of prevKey.
p.keyBuf = append(p.keyBuf[:0], p.curUnsafeKey.Key...)
return p.prevKey(p.keyBuf)
}
// setAdvanceKeyAtEnd updates the machine to the corresponding "advance key at
// end" state.
func (p *pebbleMVCCScanner) setAdvanceKeyAtEnd() {
if p.reverse {
p.machine.fn = advanceKeyAtEndReverse
} else {
p.machine.fn = advanceKeyAtEndForward
}
}
// advanceKeyAtEndReverse advances to the next key when the iterator's end has
// been reached in the reverse direction.
func (p *pebbleMVCCScanner) advanceKeyAtEndReverse() bool {
// Iterating to the next key might have caused the iterator to reach the
// end of the key space. If that happens, back up to the very last key.
p.peeked = false
p.parent.SeekLT(MVCCKey{Key: p.end})
if !p.iterValid() {
return false
}
if !p.processRangeKeys(true /* seeked */, true /* reverse */) {
return false
}
if !p.updateCurrent() {
return false
}
return p.advanceKeyReverse()
}
// setAdvanceKeyAtNewKey updates the machine to the corresponding "advance key
// at new key" state.
func (p *pebbleMVCCScanner) setAdvanceKeyAtNewKey(origKey []byte) {
p.machine.origKey = origKey
if p.reverse {
p.machine.fn = advanceKeyAtNewKeyReverse
} else {
p.machine.fn = advanceKeyAtNewKeyForward
}
}
// advanceKeyAtNewKeyReverse advances to the key after the key stored in
// p.machine.origKey in the reverse direction, assuming we have just reached the
// key after p.machine.origKey in the forward direction.
func (p *pebbleMVCCScanner) advanceKeyAtNewKeyReverse() bool {
// We've advanced to the next key but need to move back to the previous key.
// Note that we already made the copy in seekVersion, so we can just use the
// key as is.
return p.prevKey(p.machine.origKey)
}
// IncludeStartKeyIntoErr wraps with the given error to include the provided
// start key of the scan as an additional detail.
func IncludeStartKeyIntoErr(startKey roachpb.Key, err error) error {
return errors.Wrapf(err, "scan with start key %s", startKey)
}
// Adds the specified key and value to the result set, excluding
// tombstones unless p.tombstones is true. If p.rawMVCCValues is true,
// then the mvccRawBytes argument will be added to the result set
// instead.
//
// - ok indicates whether the iteration should continue. This can be false
// because we hit an error or reached some limit.
// - added indicates whether the key and value were included into the result
// set.
func (p *pebbleMVCCScanner) add(
ctx context.Context, key roachpb.Key, rawKey []byte, rawValue []byte, mvccRawBytes []byte,
) (ok, added bool) {
// Don't include deleted versions len(val) == 0, unless we've been instructed
// to include tombstones in the results.
if len(rawValue) == 0 && !p.tombstones {
return true /* ok */, false
}
if p.rawMVCCValues {
rawValue = mvccRawBytes
}
// If the scanner has been configured with the skipLocked option, don't
// include locked keys in the result set. Consult the in-memory lock table to
// determine whether this is locked with an unreplicated lock. Replicated
// locks will be represented as intents, which will be skipped over in
// getOne.
if p.skipLocked {
if locked, ok := p.isKeyLockedByConflictingTxn(ctx, rawKey); !ok {
return false, false
} else if locked {
return true /* ok */, false
}
}
numKeys, numBytes, numBytesInc := p.results.sizeInfo(len(rawKey), len(rawValue))
// Check if adding the key would exceed a limit.
if p.targetBytes > 0 && numBytes+numBytesInc > p.targetBytes {
p.resumeReason = kvpb.RESUME_BYTE_LIMIT
p.resumeNextBytes = numBytesInc
} else if p.maxKeys > 0 && numKeys >= p.maxKeys {
p.resumeReason = kvpb.RESUME_KEY_LIMIT
}
var mustPutKey bool
if p.resumeReason != 0 {
// If we exceeded a limit, but we're not allowed to return an empty result,
// then make sure we include the first key in the result. If wholeRows is
// enabled, then also make sure we complete the first SQL row.
if !p.allowEmpty &&
(numKeys == 0 || (p.wholeRows && p.results.continuesFirstRow(key))) {
p.resumeReason = 0
p.resumeNextBytes = 0
mustPutKey = true
} else {
p.resumeKey = key
// If requested, remove any partial SQL rows from the end of the result.
if p.wholeRows {
trimmedKey, err := p.results.maybeTrimPartialLastRow(key)
if err != nil {
p.err = err
return false, false
}
if trimmedKey != nil {
p.resumeKey = trimmedKey
}
}
return false, false
}
}
// We are here due to one of the following cases:
// A. No limits were exceeded
// B. Limits were exceeded, but we need to put a key, so mustPutKey = true.
//
// For B we will never set maxNewSize.
// For A, we may set maxNewSize, but we already know that
// p.targetBytes >= numBytes + numBytesInc
// so maxNewSize will be sufficient.
var maxNewSize int
if p.targetBytes > 0 && p.targetBytes > numBytes && !mustPutKey {
// INVARIANT: !mustPutKey => maxNewSize is sufficient for key-value
// pair.
maxNewSize = int(p.targetBytes - numBytes)
}
if err := p.results.put(ctx, rawKey, rawValue, p.memAccount, maxNewSize); err != nil {
p.err = IncludeStartKeyIntoErr(p.start, err)
return false, false
}
numKeys++
// Check if we hit the key limit just now to avoid scanning further before
// checking the key limit above on the next iteration. This has a small cost
// (~0.5% for large scans), but avoids the potentially large cost of scanning
// lots of garbage before the next key -- especially when maxKeys is small.
if p.maxKeys > 0 && numKeys >= p.maxKeys {
// If we're not allowed to return partial SQL rows, check whether the last
// KV pair in the result has the maximum column family ID of the row. If so,
// we can return early. However, if it doesn't then we can't know yet
// whether the row is complete or not, because the final column families of
// the row may have been omitted (if they are all NULL values) -- to find
// out, we must continue scanning to the next key and handle it above.
if !p.wholeRows || p.results.lastRowHasFinalColumnFamily(p.reverse) {
p.resumeReason = kvpb.RESUME_KEY_LIMIT
return false /* ok */, true /* added */
}
}
return true /* ok */, true /* added */
}
// addSynthetic adds a synthetic point key for the given range key version.
func (p *pebbleMVCCScanner) addSynthetic(
ctx context.Context, key roachpb.Key, version MVCCRangeKeyVersion,
) (ok, added bool) {
p.keyBuf = EncodeMVCCKeyToBuf(p.keyBuf[:0], MVCCKey{Key: key, Timestamp: version.Timestamp})
var value MVCCValue
var simple bool
value, simple, p.err = tryDecodeSimpleMVCCValue(version.Value)
if !simple && p.err == nil {
value, p.err = decodeExtendedMVCCValue(version.Value, p.decodeMVCCHeaders)
}
if p.err != nil {
return false, false
}
return p.add(ctx, key, p.keyBuf, value.Value.RawBytes, version.Value)
}
// Seeks to the latest revision of the current key that's still less than or
// equal to the specified timestamp and adds it to the result set.
// - ok indicates whether the iteration should continue.
// - added indicates whether the key and value were included into the result
// set.
func (p *pebbleMVCCScanner) seekVersion(
ctx context.Context, seekTS hlc.Timestamp, uncertaintyCheck bool,
) (ok, added bool) {
if seekTS.IsEmpty() {
// If the seek timestamp is empty, we've already seen all versions of this
// key, so seek to the next key. Seeking to version zero of the current key
// would be incorrect, as version zero is stored before all other versions.
return true /* ok */, false
}
seekKey := MVCCKey{Key: p.curUnsafeKey.Key, Timestamp: seekTS}
p.keyBuf = EncodeMVCCKeyToBuf(p.keyBuf[:0], seekKey)
origKey := p.keyBuf[:len(p.curUnsafeKey.Key)]
// We will need seekKey below, if the iterNext calls don't suffice. Even though the
// MVCCIterator will be at a different version of the same key, it is free
// to mutate the backing for p.curUnsafeKey.Key in an arbitrary manner. So
// assign to this copy, to make it stable.
seekKey.Key = origKey
for i := 0; i < p.itersBeforeSeek; i++ {
if !p.iterNext() {
p.setAdvanceKeyAtEnd()
return true /* ok */, false
}
if !bytes.Equal(p.curUnsafeKey.Key, origKey) {
p.incrementItersBeforeSeek()
p.setAdvanceKeyAtNewKey(origKey)
return true /* ok */, false
}
if p.curUnsafeKey.Timestamp.LessEq(seekTS) {
p.incrementItersBeforeSeek()
v, valid := p.getFromLazyValue()
if !valid {
return false, false
}
uncertaintyCheckRequired := uncertaintyCheck && !p.curUnsafeKey.Timestamp.LessEq(p.ts)
if !p.mvccHeaderRequired(uncertaintyCheckRequired) {
if !p.decodeCurrentValueIgnoringHeader(v) {
return false, false
}
} else if extended, valid := p.tryDecodeCurrentValueSimple(v); !valid {
return false, false
} else if extended {
if !p.decodeCurrentValueExtended(v) {
return false, false
}
}
if !uncertaintyCheckRequired {
if rkv, ok := p.coveredByRangeKey(p.curUnsafeKey.Timestamp); ok {
return p.addSynthetic(ctx, p.curUnsafeKey.Key, rkv)
}
return p.add(ctx, p.curUnsafeKey.Key, p.curRawKey, p.curUnsafeValue.Value.RawBytes, v)
}
// Iterate through uncertainty interval. Though we found a value in
// the interval, it may not be uncertain. This is because seekTS
// is set to the transaction's global uncertainty limit, so we are
// seeking based on the worst-case uncertainty, but values with a
// time in the range (uncertainty.LocalLimit, uncertainty.GlobalLimit]
// are only uncertain if they have an earlier local timestamp that is
// before uncertainty.LocalLimit. Meanwhile, any value with a time in
// the range (ts, uncertainty.LocalLimit] is uncertain.
localTS := p.curUnsafeValue.GetLocalTimestamp(p.curUnsafeKey.Timestamp)
if p.uncertainty.IsUncertain(p.curUnsafeKey.Timestamp, localTS) {
return p.uncertaintyError(p.curUnsafeKey.Timestamp, localTS), false
}
}
}
p.decrementItersBeforeSeek()
if !p.iterSeek(seekKey) {
p.setAdvanceKeyAtEnd()
return true /* ok */, false
}
for {
if !bytes.Equal(p.curUnsafeKey.Key, origKey) {
p.setAdvanceKeyAtNewKey(origKey)
return true /* ok */, false
}
v, valid := p.getFromLazyValue()
if !valid {
return false, false
}
uncertaintyCheckRequired := uncertaintyCheck && !p.curUnsafeKey.Timestamp.LessEq(p.ts)
if !p.mvccHeaderRequired(uncertaintyCheckRequired) {
if !p.decodeCurrentValueIgnoringHeader(v) {
return false, false
}
} else if extended, valid := p.tryDecodeCurrentValueSimple(v); !valid {
return false, false
} else if extended {
if !p.decodeCurrentValueExtended(v) {
return false, false
}
}
if !uncertaintyCheckRequired {
if rkv, ok := p.coveredByRangeKey(p.curUnsafeKey.Timestamp); ok {
return p.addSynthetic(ctx, p.curUnsafeKey.Key, rkv)
}
return p.add(ctx, p.curUnsafeKey.Key, p.curRawKey, p.curUnsafeValue.Value.RawBytes, v)
}
// Iterate through uncertainty interval. See the comment above about why
// a value in this interval is not necessarily cause for an uncertainty
// error.
localTS := p.curUnsafeValue.GetLocalTimestamp(p.curUnsafeKey.Timestamp)
if p.uncertainty.IsUncertain(p.curUnsafeKey.Timestamp, localTS) {
return p.uncertaintyError(p.curUnsafeKey.Timestamp, localTS), false
}
if !p.iterNext() {
p.setAdvanceKeyAtEnd()
return true /* ok */, false
}
}
}
// coveredByRangeKey returns the topmost range key at the current position
// between the given timestamp and the read timestamp p.ts, if any.
//
//gcassert:inline
func (p *pebbleMVCCScanner) coveredByRangeKey(ts hlc.Timestamp) (rkv MVCCRangeKeyVersion, ok bool) {
// This code is a bit odd to fit it within the mid-stack inlining budget. We
// can't use p.curRangeKeys.IsEmpty(), nor early returns.
if len(p.curRangeKeys.Versions) > 0 {
rkv, ok = p.doCoveredByRangeKey(ts)
}
return rkv, ok
}
// doCoveredByRangeKey is a helper for coveredByRangeKey to allow mid-stack
// inlining. It is only called when there are range keys present.
func (p *pebbleMVCCScanner) doCoveredByRangeKey(ts hlc.Timestamp) (MVCCRangeKeyVersion, bool) {
// In the common case when tombstones are disabled, range key masking will be
// enabled and so the point key will generally be above the upper range
// key (unless we're reading in the past). We fast-path this here.
if p.tombstones || ts.LessEq(p.curRangeKeys.Newest()) {
if rkv, ok := p.curRangeKeys.FirstAtOrBelow(p.ts); ok && ts.LessEq(rkv.Timestamp) {
return rkv, true
}
}
return MVCCRangeKeyVersion{}, false
}
// processRangeKeys will check for any newly encountered MVCC range keys (as
// determined by RangeKeyChanged), perform conflict checks for them, decode them
// into p.curRangeKeys, and skip across bare range keys until positioned on a
// point key or exhausted iterator. It must be called after every iterator
// positioning operation, to make sure it sees the RangeKeyChanged signal.
// Requires a valid iterator. Returns true if iteration can continue.
//
// seeked must be set to true following an iterator seek operation. In the
// forward direction, bare range keys are only possible with RangeKeyChanged or
// SeekGE, which allows omitting HasPointAndRange calls in the common Next case.
// It's also required to handle the case where the scanner is given a used
// iterator that may already be positioned on a range key such that the initial
// seek won't trigger RangeKeyChanged.
//
// reverse must be set to true if the previous iterator operation was a reverse
// operation (SeekLT or Prev). This determines the direction to skip in, and
// also requires checking for bare range keys after every step, since we'll
// land on them last.
func (p *pebbleMVCCScanner) processRangeKeys(seeked bool, reverse bool) bool {
// Look for new range keys to process, and step across bare range keys until
// we land on a point key (or exhaust the iterator).
for {
// In the forward direction, we can only land on a bare range key when
// RangeKeyChanged fires (at its start bound) or when we SeekGE within it.
rangeKeyChanged := p.parent.RangeKeyChanged()
if !rangeKeyChanged && !reverse && !seeked {
return true
}
// We fast-path the common no-range-key case.
hasPoint, hasRange := p.parent.HasPointAndRange()
if !hasRange {
p.curRangeKeys = MVCCRangeKeyStack{}
return true
}
// Process new range keys. On the initial seek it's possible that we're
// given an iterator that's already positioned on a range key, so
// RangeKeyChanged won't fire -- we handle that case here as well.
if rangeKeyChanged || (seeked && p.curRangeKeys.IsEmpty()) {
p.curRangeKeys = p.parent.RangeKeys()
// Check for conflicts with range keys at or above the read timestamp.
// We don't need to handle e.g. skipLocked, because range keys don't
// currently have intents.
if p.failOnMoreRecent {
if key := p.parent.UnsafeKey(); !hasPoint || !key.Timestamp.IsEmpty() {
if newest := p.curRangeKeys.Newest(); p.ts.LessEq(newest) {
if p.mostRecentTS.Forward(newest) {
p.mostRecentKey = append(p.mostRecentKey[:0], key.Key...)
}
}
}
}
// Check if any of the range keys are in the uncertainty interval.
if p.checkUncertainty {
for _, version := range p.curRangeKeys.Versions {
if version.Timestamp.LessEq(p.ts) {
break
}
var value MVCCValue
var simple bool
value, simple, p.err = tryDecodeSimpleMVCCValue(version.Value)
if !simple && p.err == nil {
value, p.err = decodeExtendedMVCCValue(version.Value, true)
}
if p.err != nil {
return false
}
localTS := value.GetLocalTimestamp(version.Timestamp)
if p.uncertainty.IsUncertain(version.Timestamp, localTS) {
return p.uncertaintyError(version.Timestamp, localTS)
}
}
}
}
// If we're on a point key we're done, otherwise keep stepping.
if hasPoint {
return true
}
if !reverse {
p.parent.Next()
} else {
p.parent.Prev()
}
if !p.iterValid() {
return false
}
}
}
// Updates cur{RawKey, UnsafeKey, RawValue} to match the record the iterator is
// pointing to. Callers should call decodeCurrent{Metadata, Value} to decode
// the raw value if they need it.
//
// Must only be called with a valid iterator.
func (p *pebbleMVCCScanner) updateCurrent() bool {
p.curRawKey = p.parent.UnsafeRawMVCCKey()
var err error
p.curUnsafeKey, err = DecodeMVCCKey(p.curRawKey)
if err != nil {
p.err = errors.Wrap(err, "unable to decode MVCCKey")
return false
}
p.curRawValue = p.parent.UnsafeLazyValue()
// Reset decoded value to avoid bugs.
if util.RaceEnabled {
p.meta = enginepb.MVCCMetadata{}
p.curUnsafeValue = MVCCValue{}
}
return true
}
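// getFromLazyValue returns the raw bytes of the current value, materializing
// the underlying lazy value if necessary. When Pebble hands back a
// caller-owned copy, the backing buffer is retained in p.lazyValueBuf for
// reuse. On failure it sets p.err and returns valid=false.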
func (p *pebbleMVCCScanner) getFromLazyValue() (v []byte, valid bool) {
v, callerOwned, err := p.curRawValue.Value(p.lazyValueBuf)
if err != nil {
p.err = err
return nil, false
}
if callerOwned {
p.lazyValueBuf = v[:0]
}
return v, true
}
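// decodeCurrentMetadata unmarshals the current raw value into p.meta as an
// MVCCMetadata. It sets p.err and returns false if the value is empty or
// cannot be decoded.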
func (p *pebbleMVCCScanner) decodeCurrentMetadata() bool {
val, valid := p.getFromLazyValue()
if !valid {
return false
}
if len(val) == 0 {
p.err = errors.Errorf("zero-length mvcc metadata")
return false
}
err := protoutil.Unmarshal(val, &p.meta)
if err != nil {
p.err = errors.Wrap(err, "unable to decode MVCCMetadata")
return false
}
return true
}
// mvccHeaderRequired returns true if the caller should fully
// unmarshal the MVCCValueHeader when parsing an MVCCValue.
//
// The passed bool indicates whether the caller needs the
// MVCCValueHeader because they are going to do an uncertainty check,
// which may require the LocalTimestamp stored in the MVCCValueHeader.
//
//gcassert:inline
func (p *pebbleMVCCScanner) mvccHeaderRequired(uncertaintyCheckRequired bool) bool {
return uncertaintyCheckRequired || p.decodeMVCCHeaders
}
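// decodeCurrentValueIgnoringHeader decodes the given raw bytes into
// p.curUnsafeValue without unmarshalling the MVCCValueHeader.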
//gcassert:inline
func (p *pebbleMVCCScanner) decodeCurrentValueIgnoringHeader(v []byte) bool {
p.curUnsafeValue, p.err = decodeMVCCValueIgnoringHeader(v)
return p.err == nil
}
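// tryDecodeCurrentValueSimple attempts the simple (fast-path) decoding of the
// given raw bytes into p.curUnsafeValue. extended is true if the value has an
// extended encoding and must instead be decoded via decodeCurrentValueExtended;
// valid is false if decoding failed (p.err is set).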
//gcassert:inline
func (p *pebbleMVCCScanner) tryDecodeCurrentValueSimple(v []byte) (extended bool, valid bool) {
var simple bool
p.curUnsafeValue, simple, p.err = tryDecodeSimpleMVCCValue(v)
return !simple, p.err == nil
}
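// decodeCurrentValueExtended decodes the given raw bytes, including the
// MVCCValueHeader, into p.curUnsafeValue.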
//gcassert:inline
func (p *pebbleMVCCScanner) decodeCurrentValueExtended(v []byte) bool {
p.curUnsafeValue, p.err = decodeExtendedMVCCValue(v, true)
return p.err == nil
}
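// iterValid returns whether the underlying iterator is positioned on a valid
// entry, propagating any iteration error to p.err.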
func (p *pebbleMVCCScanner) iterValid() bool {
if valid, err := p.parent.Valid(); !valid {
// Defensive: unclear if p.err can already be non-nil here, but
// regardless, don't overwrite unless we have a non-nil err.
if err != nil {
p.err = err
}
return false
}
return true
}
// iterSeek seeks to the latest revision of the specified key (or a greater key).
func (p *pebbleMVCCScanner) iterSeek(key MVCCKey) bool {
p.clearPeeked()
p.parent.SeekGE(key)
if !p.iterValid() {
return false
}
if !p.processRangeKeys(true /* seeked */, false /* reverse */) {
return false
}
return p.updateCurrent()
}
// iterSeekReverse seeks to the latest revision of the key before the specified key.
func (p *pebbleMVCCScanner) iterSeekReverse(key MVCCKey) bool {
p.clearPeeked()
p.parent.SeekLT(key)
if !p.iterValid() {
return false
}
if !p.processRangeKeys(true /* seeked */, true /* reverse */) {
return false
}
if !p.updateCurrent() {
// We have seeked to before the start key. Return.
return false
}
if p.curUnsafeKey.Timestamp.IsEmpty() {
// We landed on an intent or inline value.
return true
}
// We landed on a versioned value, we need to back up to find the
// latest version.
return p.backwardLatestVersion(p.curUnsafeKey.Key, 0)
}
// iterNext advances to the next MVCC key.
func (p *pebbleMVCCScanner) iterNext() bool {
if p.reverse && p.peeked {
// If we have peeked at the previous entry, we need to advance the iterator
// to get back to the current entry. If we've peeked off the beginning,
// it's okay to Next: Pebble will reposition to the first visible key.
p.peeked = false
p.parent.Next()
// We don't need to process range key changes here, because curRangeKeys
// already contains the range keys at this position from before the peek.
if buildutil.CrdbTestBuild {
p.assertOwnedRangeKeys()
}
if !p.iterValid() {
return false
}
}
// Step forward from the current entry.
p.parent.Next()
if !p.iterValid() {
return false
}
if !p.processRangeKeys(false /* seeked */, false /* reverse */) {
return false
}
return p.updateCurrent()
}
// iterPrev advances to the previous MVCC key.
func (p *pebbleMVCCScanner) iterPrev() bool {
if p.peeked {
p.peeked = false
} else {
p.parent.Prev()
}
if !p.iterValid() {
return false
}
if !p.processRangeKeys(false /* seeked */, true /* reverse */) {
return false
}
return p.updateCurrent()
}
// Peek the previous key and store the result in peekedKey, also returning
// whether the peeked key had a point key or only a bare range key. Note that
// this moves the iterator backward, while leaving p.cur{key,value,rawKey,RangeKeys}
// untouched and therefore out of sync. iterPrev and iterNext take this into
// account.
//
// NB: Unlike iterPrev() and iterNext(), iterPeekPrev() will not skip across
// bare range keys: we have to do conflict checks on any new range keys when we
// step onto them, which may happen on the next positioning operation. The
// returned hasPoint value will indicate whether the peeked position is a bare
// range key or not.
func (p *pebbleMVCCScanner) iterPeekPrev() ([]byte, bool, bool) {
if !p.peeked {
p.peeked = true
// We need to save a copy of the current iterator key and value and adjust
// curRawKey, curKey and curValue to point to this saved data. We use a
// single buffer for this purpose: savedBuf.
p.savedBuf = append(p.savedBuf[:0], p.curRawKey...)
p.curRawValue, p.savedBuf = p.curRawValue.Clone(p.savedBuf, &p.lazyFetcherBuf)
p.curRawKey = p.savedBuf[:len(p.curRawKey)]
// The unversioned key is always a prefix of the raw encoded MVCC key. Take
// advantage of this to sub-slice curRawKey directly, instead of calling SplitMVCCKey.
p.curUnsafeKey.Key = p.curRawKey[:len(p.curUnsafeKey.Key)]
// We need to save copies of the current range keys too, but we can avoid
// this if we already saved them previously (if cur and saved share memory).
if curStart := p.curRangeKeys.Bounds.Key; len(curStart) > 0 {
savedStart := p.savedRangeKeys.Bounds.Key
if len(curStart) != len(savedStart) || &curStart[0] != &savedStart[0] {
p.curRangeKeys.CloneInto(&p.savedRangeKeys)
p.curRangeKeys = p.savedRangeKeys
}
}
// With the current iterator state saved we can move the iterator to the
// previous entry.
p.parent.Prev()
if !p.iterValid() {
// The iterator is now invalid, but note that this case is handled in
// both iterNext and iterPrev. In the former case, we'll position the
// iterator at the first entry, and in the latter iteration will be done.
return nil, false, false
}
} else if !p.iterValid() {
return nil, false, false
}
peekedKey := p.parent.UnsafeKey()
// We may land on a bare range key without RangeKeyChanged firing, but only at
// its start bound where the timestamp must be empty. HasPointAndRange() is
// not cheap, so we only check it when necessary.
hasPoint := true
if peekedKey.Timestamp.IsEmpty() {
hasPoint, _ = p.parent.HasPointAndRange()
}
return peekedKey.Key, hasPoint, true
}
// Clear the peeked flag. Call this before any iterator operations.
func (p *pebbleMVCCScanner) clearPeeked() {
if p.reverse {
p.peeked = false
}
}
// isKeyLockedByConflictingTxn consults the in-memory lock table to determine
// whether the provided key is locked with an unreplicated lock by a different
// txn. When p.skipLocked, this method should be called before adding a key to
// the scan's result set or throwing a write too old error on behalf of a key.
// If the key is locked, skipLocked instructs the scan to skip over it instead.
func (p *pebbleMVCCScanner) isKeyLockedByConflictingTxn(
ctx context.Context, rawKey []byte,
) (locked, ok bool) {
key, _, err := enginepb.DecodeKey(rawKey)
if err != nil {
p.err = err
return false, false
}
ok, txn, err := p.lockTable.IsKeyLockedByConflictingTxn(ctx, key)
if err != nil {
p.err = err
return false, false
}
if ok {
// The key is locked or reserved, so ignore it.
if txn != nil && (p.maxLockConflicts == 0 || int64(p.intents.Count()) < p.maxLockConflicts) {
// However, if the key is locked, we return the lock holder separately
// (if we have room); the caller may want to resolve it.
if !p.addKeyAndMetaAsIntent(ctx, key, txn) {
return false, false
}
}
return true, true
}
return false, true
}
// addCurIntent adds the key-value pair that the scanner is currently
// pointing to as an intent to the intents set.
func (p *pebbleMVCCScanner) addCurIntent(ctx context.Context) bool {
v, valid := p.getFromLazyValue()
if !valid {
return false
}
return p.addRawIntent(ctx, p.curRawKey, v)
}
// addKeyAndMetaAsIntent adds the key and transaction meta as an intent to
// the intents set.
func (p *pebbleMVCCScanner) addKeyAndMetaAsIntent(
ctx context.Context, key roachpb.Key, txn *enginepb.TxnMeta,
) bool {
if txn == nil {
p.err = errors.AssertionFailedf("nil txn passed to addKeyAndMetaAsIntent")
return false
}
mvccKey := MakeMVCCMetadataKey(key)
mvccVal := enginepb.MVCCMetadata{Txn: txn}
encodedKey := EncodeMVCCKey(mvccKey)
encodedVal, err := protoutil.Marshal(&mvccVal)
if err != nil {
p.err = err
return false
}
return p.addRawIntent(ctx, encodedKey, encodedVal)
}
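// addRawIntent adds the given encoded key/value pair to the intents batch,
// growing the memory account by the key and value sizes first.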
func (p *pebbleMVCCScanner) addRawIntent(ctx context.Context, key, value []byte) bool {
// p.intents is a pebble.Batch which grows its byte slice capacity in
// chunks to amortize allocations. The memMonitor is under-counting here
// by only accounting for the key and value bytes.
if p.err = p.memAccount.Grow(ctx, int64(len(key)+len(value))); p.err != nil {
p.err = IncludeStartKeyIntoErr(p.start, p.err)
return false
}
p.err = p.intents.Set(key, value, nil)
return p.err == nil
}
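// intentsRepr returns the serialized representation of the collected intents
// batch, or nil if no intents were collected.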
func (p *pebbleMVCCScanner) intentsRepr() []byte {
if p.intents.Count() == 0 {
return nil
}
return p.intents.Repr()
}
// assertOwnedRangeKeys asserts that p.curRangeKeys is empty, or backed by
// p.savedRangeKeys's buffers.
func (p *pebbleMVCCScanner) assertOwnedRangeKeys() {
if p.curRangeKeys.IsEmpty() {
return
}
// NB: We compare on the EndKey in case the start key is /Min, the empty
// key.
if &p.curRangeKeys.Bounds.EndKey[0] != &p.savedRangeKeys.Bounds.EndKey[0] {
panic(errors.AssertionFailedf("current range keys are not scanner-owned"))
}
}
// Copyright 2022 The Cockroach Authors.
//
// Use of this software is governed by the CockroachDB Software License
// included in the /LICENSE file.
package storage
import (
"github.com/cockroachdb/cockroach/pkg/roachpb"
"github.com/cockroachdb/cockroach/pkg/util"
"github.com/cockroachdb/cockroach/pkg/util/hlc"
"github.com/cockroachdb/errors"
)
// ReadAsOfIterator wraps a SimpleMVCCIterator and only surfaces the latest
// valid point key of a given MVCC key that is also below the asOf timestamp, if
// set. Further, the iterator does not surface point or range tombstones, nor
// any MVCC keys shadowed by tombstones below the asOf timestamp, if set. The
// iterator assumes that it will not encounter any write intents.
type ReadAsOfIterator struct {
iter SimpleMVCCIterator
// asOf is the latest timestamp of a key surfaced by the iterator.
asOf hlc.Timestamp
// valid tracks if the current key is valid
valid bool
// err tracks if iterating to the current key returned an error
err error
// newestRangeTombstone contains the timestamp of the latest range
// tombstone below asOf at the current position, if any.
newestRangeTombstone hlc.Timestamp
}
var _ SimpleMVCCIterator = &ReadAsOfIterator{}
// Close closes the underlying iterator.
func (f *ReadAsOfIterator) Close() {
f.iter.Close()
}
// SeekGE advances the iterator to the first key in the engine which is >= the
// provided key that obeys the ReadAsOfIterator key constraints.
func (f *ReadAsOfIterator) SeekGE(originalKey MVCCKey) {
// To ensure SeekGE seeks to a key that isn't shadowed by a tombstone that the
// ReadAsOfIterator would have skipped (i.e. a tombstone below asOf), seek to
// the key with the latest possible timestamp that the iterator could surface
// (i.e. asOf, if set) and iterate to the next valid key at or below the caller's
// key that also obeys the iterator's constraints.
synthetic := MVCCKey{Key: originalKey.Key, Timestamp: f.asOf}
f.iter.SeekGE(synthetic)
if f.advance(true /* seeked */); f.valid && f.UnsafeKey().Less(originalKey) {
// The following is true:
// originalKey.Key == f.UnsafeKey &&
// f.asOf timestamp (if set) >= current timestamp > originalKey timestamp.
//
// This implies the caller is seeking to a key that is shadowed by a valid
// key that obeys the iterator's constraints. The caller's key is NOT the
// latest key of the given MVCC key; therefore, skip to the next MVCC key.
f.NextKey()
}
}
// Valid implements SimpleMVCCIterator.
func (f *ReadAsOfIterator) Valid() (bool, error) {
if util.RaceEnabled && f.valid {
if err := f.assertInvariants(); err != nil {
return false, err
}
}
return f.valid, f.err
}
// Next advances the iterator to the next valid MVCC key obeying the iterator's
// constraints. Note that Next and NextKey have the same implementation because
// the iterator only surfaces the latest valid key of a given MVCC key below the
// asOf timestamp.
func (f *ReadAsOfIterator) Next() {
f.NextKey()
}
// NextKey advances the iterator to the next valid MVCC key obeying the
// iterator's constraints. NextKey() is only guaranteed to surface a key that
// obeys the iterator's constraints if the iterator was already on a key that
// obeys the constraints. To ensure this, initialize the iterator with a SeekGE
// call before any calls to NextKey().
func (f *ReadAsOfIterator) NextKey() {
f.iter.NextKey()
f.advance(false /* seeked */)
}
// UnsafeKey returns the current key, but the memory is invalidated on the next
// call to {NextKey,Seek}.
func (f *ReadAsOfIterator) UnsafeKey() MVCCKey {
return f.iter.UnsafeKey()
}
// UnsafeValue returns the current value as a byte slice, but the memory is
// invalidated on the next call to {NextKey,Seek}.
func (f *ReadAsOfIterator) UnsafeValue() ([]byte, error) {
return f.iter.UnsafeValue()
}
// MVCCValueLenAndIsTombstone implements the SimpleMVCCIterator interface.
func (f *ReadAsOfIterator) MVCCValueLenAndIsTombstone() (int, bool, error) {
return f.iter.MVCCValueLenAndIsTombstone()
}
// ValueLen implements the SimpleMVCCIterator interface.
func (f *ReadAsOfIterator) ValueLen() int {
return f.iter.ValueLen()
}
// HasPointAndRange implements SimpleMVCCIterator. The iterator never surfaces
// range keys, so this always returns (true, false).
func (f *ReadAsOfIterator) HasPointAndRange() (bool, bool) {
return true, false
}
// RangeBounds always returns an empty span, since the iterator never surfaces
// range keys.
func (f *ReadAsOfIterator) RangeBounds() roachpb.Span {
return roachpb.Span{}
}
// RangeKeys is always empty since this iterator never surfaces range keys.
func (f *ReadAsOfIterator) RangeKeys() MVCCRangeKeyStack {
return MVCCRangeKeyStack{}
}
// RangeKeyChanged implements SimpleMVCCIterator.
func (f *ReadAsOfIterator) RangeKeyChanged() bool {
return false
}
// updateValid updates f.valid and f.err based on the underlying iterator, and
// returns true if valid.
func (f *ReadAsOfIterator) updateValid() bool {
f.valid, f.err = f.iter.Valid()
return f.valid
}
// advance moves past keys with timestamps later than f.asOf and skips MVCC keys
// whose latest value (subject to f.asOf) has been deleted by a point or range
// tombstone.
func (f *ReadAsOfIterator) advance(seeked bool) {
for {
if ok := f.updateValid(); !ok {
return
}
// Detect range tombstones, and step forward to the next key if on a bare
// range key.
if seeked || f.iter.RangeKeyChanged() {
seeked = false
hasPoint, hasRange := f.iter.HasPointAndRange()
f.newestRangeTombstone = hlc.Timestamp{}
if hasRange {
if v, ok := f.iter.RangeKeys().FirstAtOrBelow(f.asOf); ok {
f.newestRangeTombstone = v.Timestamp
}
}
if !hasPoint {
f.iter.Next()
continue
}
}
// Process point keys.
if key := f.iter.UnsafeKey(); f.asOf.Less(key.Timestamp) {
// Skip keys above the asOf timestamp.
f.iter.Next()
} else {
v, err := f.iter.UnsafeValue()
if err != nil {
f.valid, f.err = false, err
return
}
if isTombstone, err := EncodedMVCCValueIsTombstone(v); err != nil {
f.valid, f.err = false, err
return
} else if isTombstone {
// Skip to the next MVCC key if we find a point tombstone.
f.iter.NextKey()
} else if key.Timestamp.LessEq(f.newestRangeTombstone) {
// The latest range key, as of system time, shadows the latest point key.
// This key is therefore deleted as of system time.
f.iter.NextKey()
} else {
// On a valid key that potentially shadows range key(s).
return
}
}
}
}
// NewReadAsOfIterator constructs a ReadAsOfIterator. If asOf is not set, the
// iterator reads the most recent data.
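//
// A minimal usage sketch (illustrative only, not part of the API): drive the
// iterator with an initial SeekGE followed by NextKey, checking Valid after
// each step. Here inner, asOf, and startKey are placeholders assumed to be
// supplied by the caller.
//
//	it := NewReadAsOfIterator(inner, asOf)
//	defer it.Close()
//	for it.SeekGE(MVCCKey{Key: startKey}); ; it.NextKey() {
//		if ok, err := it.Valid(); err != nil || !ok {
//			break // handle err, if any
//		}
//		key := it.UnsafeKey()
//		value, err := it.UnsafeValue()
//		if err != nil {
//			break // handle the error
//		}
//		_, _ = key, value // consume the surfaced key/value
//	}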
func NewReadAsOfIterator(iter SimpleMVCCIterator, asOf hlc.Timestamp) *ReadAsOfIterator {
if asOf.IsEmpty() {
asOf = hlc.MaxTimestamp
}
return &ReadAsOfIterator{iter: iter, asOf: asOf}
}
// assertInvariants asserts iterator invariants. The iterator must be valid.
func (f *ReadAsOfIterator) assertInvariants() error {
// Check general SimpleMVCCIterator API invariants.
if err := assertSimpleMVCCIteratorInvariants(f); err != nil {
return err
}
// asOf must be set.
if f.asOf.IsEmpty() {
return errors.AssertionFailedf("f.asOf is empty")
}
// The underlying iterator must be valid.
if ok, err := f.iter.Valid(); !ok || err != nil {
return errors.AssertionFailedf("invalid underlying iter with err=%v", err)
}
// Keys can't be intents or inline values, and must have timestamps at or
// below the readAsOf timestamp.
key := f.UnsafeKey()
if key.Timestamp.IsEmpty() {
return errors.AssertionFailedf("emitted key %s has no timestamp", key)
}
if f.asOf.Less(key.Timestamp) {
return errors.AssertionFailedf("emitted key %s above asOf timestamp %s", key, f.asOf)
}
// Tombstones should not be emitted.
if _, isTombstone, err := f.MVCCValueLenAndIsTombstone(); err != nil {
return errors.NewAssertionErrorWithWrappedErrf(err, "invalid value")
} else if isTombstone {
return errors.AssertionFailedf("emitted tombstone for key %s", key)
}
return nil
}
// Copyright 2021 The Cockroach Authors.
//
// Use of this software is governed by the CockroachDB Software License
// included in the /LICENSE file.
package storage
import (
"context"
"github.com/cockroachdb/cockroach/pkg/raft/raftpb"
"github.com/cockroachdb/cockroach/pkg/roachpb"
"github.com/cockroachdb/cockroach/pkg/settings/cluster"
"github.com/cockroachdb/redact"
"github.com/cockroachdb/redact/interfaces"
)
// TODO(sumeer):
// Interface and design:
// - Pick names for C1-C3 and D1-D2, to make it easier to remember what we are
// referring to in various places.
// - Consider separating RecoveryInconsistentReplica into two different states
// RecoveryRaftAndStateInconsistent, and RecoveryStateInconsistent.
// - Proof sketch.
// - Read cockroach debug check-store to see additional assertion ideas/code
// we can lift.
// TODO(sumeer):
// See partial prototype in https://github.com/cockroachdb/cockroach/pull/88606.
//
// Implementation:
// - Implement interface.
// - Unit tests and randomized tests, including engine restarts that lose
// state (using vfs.NewCrashableMem).
// - Benchmarks comparing single and two engine implementations.
// - Race-build dynamically asserts that SSTs or MutationBatches that are
// passed through this interface only touch the keys they are allowed to
// touch.
// - Integration (can be done incrementally).
// - Misc cleanup:
// - Merges should cleanup QueueLastProcessedKey,
// High-level overview:
//
// ReplicasStorage provides an interface to manage the persistent state that
// includes the lifecycle of a range replica, its raft log, and the state
// machine state. The implementation(s) are expected to be a stateless wrapper
// around persistent state in the underlying engine(s) (any state they
// maintain in-memory is simply a performance optimization and always
// in-sync with the persistent state). Since this abstraction is mutating the
// same underlying engine state that was previously mutated via lower-level
// interfaces, and is not a data-structure in the usual sense, we can migrate
// callers incrementally to use this interface. That is, callers that use this
// interface, and those that use the lower-level engine interfaces can
// co-exist correctly.
//
// TODO(sumeer): this co-existence is not completely true since the following
// attempts to define an ideal interface where no sst or MutationBatch touches
// both raft engine state and state machine engine state. This means transient
// inconsistencies can develop. We will either
// - alter this interface to be more pragmatic, i.e., be a step towards the
// ideal interface, but not the end product, once we have settled on the
// ideal interface.
// - ensure that the minimal integration step includes ReplicasStorage.Init,
// which can eliminate any inconsistencies caused by an inopportune crash.
// Hopefully, the latter is sufficient.
//
// We consider the following distinct kinds of persistent state:
// - State machine state: It contains all replicated keys: replicated range-id
// local keys, range local keys, range lock keys, global keys. NB: this
// includes the RangeAppliedState and the RangeDescriptor.
//
// - Raft state: This includes all the unreplicated range-ID local key names
// prefixed by Raft. We will loosely refer to all of these as "raft state".
// RangeLastReplicaGCTimestamp changes are ignored below, since it is
// best-effort persistent state used to pace queues, and the caller is
// allowed to mutate it out-of-band. However when deleting a replica,
// ReplicasStorage will clear that key too. RangeLastReplicaGCTimestamp is
// placed in this state category because it is not replicated state machine
// state.
//
// The interface requires that any mutation (batch or sst) only touch one of
// these kinds of state. This discipline will allow us to eventually separate
// the engines containing these two kinds of state. This interface is not
// relevant for store local keys though they will be in the latter engine. The
// interface does not allow the caller to specify whether to sync a mutation
// to the raft log or state machine state -- that decision is left to the
// implementation of ReplicasStorage (with a couple of small exceptions where
// a sync is explicitly requested, which are explained later). So even when we
// don't separate the state machine and raft engines, this abstraction forces
// us to reason more carefully about the effects of crashes and when to sync, and
// allows us to test more thoroughly.
//
// RangeTombstoneKey: This is an unreplicated key that is critical to the
// replica lifecycle. Since it is unreplicated, it is not part of the state
// machine. However, placing it in the category of "raft state" with the other
// unreplicated keys turns out to be complicated:
// (a) in various range merge situations (including replicas being subsumed
// during snapshot application) we need to atomically move the state machine
// forward for the surviving range, delete the state machine state for the
// subsumed range(s) and set the RangeTombstone.
// (b) when removing a replica due to rebalancing we need to atomically remove
// the state machine and set the RangeTombstone.
// For these reasons, we require that the RangeTombstone be in the same engine
// as the state machine state. However, it can only be mutated by
// ReplicasStorage.
//
// Note about terminology as pertaining to range-id keys: "range-id local
// keys" and "range-id keys" are the same thing, since all range-id keys are
// local. As documented in the keys package, range-id keys can be replicated
// or unreplicated. All the replicated range-id keys plus the
// RangeTombstoneKey (which is unreplicated) are referred to as "range-id
// state machine" keys. All the remaining unreplicated range-id keys belong to
// raft state and are referred to as "range-id raft" keys or simply "raft"
// keys (since all raft keys are also unreplicated range-id keys).
//
// Note that the interface is not currently designed such that raft log writes
// avoid syncing to disk as discussed in
// https://github.com/cockroachdb/cockroach/issues/17500#issuecomment-727094672
// and followup comments on that issue. However, having a clean storage
// abstraction should be a reasonable step in that direction.
//
// ReplicasStorage does not interpret most of the data in the state machine.
// It expects mutations to that state to be provided as an opaque batch, or a
// set of files to be ingested. There are a few exceptions where it can read
// state machine state, mainly when recovering from a crash, so as to make
// changes to get to a consistent state.
// - RangeAppliedStateKey: needs to read this in order to truncate the log,
// both as part of regular log truncation (see the comment section on
// "Normal Replica Operation") and on crash recovery (see the comment
// section on "Replica Initialization" and "Crash Recovery" for details).
// - RangeDescriptorKey: needs to read this to discover the spans of
// initialized replicas (see the comment sections on "Invariants" and "Crash
// Recovery").
//
// A corollary to this lack of interpretation is that reads of the state
// machine are not handled by this interface, though it does expose some
// metadata in case the reader wants to be sure that the replica it is trying
// to read actually exists in storage. ReplicasStorage also does not offer an
// interface to construct changes to the state machine state. It applies
// changes, and requires the caller to obey some simple invariants to not
// cause inconsistencies. It is aware of the keyspace occupied by a replica
// and the difference between range-ID keys and range keys -- it needs this
// awareness to discard (parts of) replica state when replicas are moved or
// merged away.
//
// ReplicasStorage does interpret the raft state (all the unreplicated
// range-ID local key names prefixed by Raft), and the RangeTombstoneKey. This
// is necessary for it to be able to maintain invariants spanning the raft log
// and the state machine (related to raft log truncation, replica lifetime
// etc.), including reapplying raft log entries on restart, to the state
// machine. All accesses (read or write) to the raft log and RangeTombstoneKey
// must happen via ReplicasStorage. ReplicasStorage does not by itself apply
// committed raft log entries to the state machine in a running system -- this
// is because state machine application in a running system has complex
// data-structure side-effects that are outside the scope of ReplicasStorage.
// It is told by the caller to apply a committed entry, which also requires
// the caller to provide the state machine changes. ReplicasStorage does apply
// "simple" entries directly to the state machine during Init to fix any
// inconsistency of the state machine caused by durable sst ingestion and
// non-durable batch application (see ApplyCommitted* methods in the
// interface). Since these could be preceded by non-durable configuration
// changes, the notion of "simple" entries includes configuration changes,
// except for splits and merges (which we sync to ensure durability -- this is
// justified in the section below on "Implementation constraints on
// ReplicasStorage").
//
// TODO(sumeer):
// https://github.com/etcd-io/etcd/issues/7625#issuecomment-489232411 relies
// on a correctness argument based on bounded regression of conf changes.
// Consider strengthening that correctness argument by making the committed
// index durable for a conf change before applying it. We could introduce a
// `ApplyConfChange(MutationBatch, highestRaftIndex uint64)` method, like we
// have for ingestion, and first sync the Commit state if needed. That way we
// will not have any conf change regression.
//
// ============================================================================
// Invariants:
//
// INVARIANT (RaftAndStateConsistency): when there is any data in the state
// machine associated with a given RangeID, there is a corresponding
// internally consistent Raft state (according to etcd/raft) that is also
// consistent with the applied state on the state machine (i.e. the latter
// references a valid log position). Specifically,
// - HardState.Commit >= RangeAppliedState.RaftAppliedIndex
// - if HardState.Commit > RangeAppliedState.RaftAppliedIndex, it points to an
// entry in the raft log.
// - RaftTruncatedState.{Index,Term} must be a valid value corresponding to
// what was truncated. If there are no raft log entries,
// RaftTruncatedState.{Index,Term} must equal
// RangeAppliedState.{RaftAppliedIndex,RaftAppliedIndexTerm}.
//
// INVARIANT (StateConsistency): when there is any data in the state machine
// associated with a given RangeID, it will reflect the replicated state at
// the corresponding applied index (i.e., it materializes the replicated log
// at this index).
// Additionally, a range is first created with a RangeDescriptor present
// (external invariant) and neither the callers nor ReplicasStorage will ever
// selectively delete it. NOTE: subsumption as part of a range merge does
// delete the RangeDescriptor, but the Replica atomically ceases to exist in
// the process.
// Specifically,
// - The state machine state must be consistent with the value of
// RaftAppliedIndex, i.e., it equals a state generated from the full history
// of this range (for a range that has never been the LHS of a merge, this
// is the initial snapshot when the range came into being, followed by all
// subsequent raft log entries).
// - RaftAppliedIndex >= RaftInitialLogIndex
// - RaftAppliedIndexTerm >= RaftInitialLogTerm
// - Has at least 1 non-provisional RangeDescriptor.
// - Regression of the HardState.Commit and RaftAppliedIndex is permitted due
// to a crash except for the following:
// - Split that has progressed to applying a state machine change that
// results in a non-provisional RangeDescriptor for the RHS must not
// regress after the crash (i.e., must sync application of the split
// trigger).
// - Merge that has progressed to applying a state machine change that
// deletes the RangeDescriptor for the RHS must not regress after the
// crash (i.e., must sync application of the merge trigger).
// One could possibly relax these split/merge invariants but the corner
// cases are very subtle and make it hard to reason about correctness.
// As an example, see the discussion about "not syncing for splits" in
// https://github.com/cockroachdb/cockroach/pull/72745#pullrequestreview-807989047
//
// INVARIANT (InterReplicaStateConsistency): The latest non-provisional
// RangeDescriptors of replicas with state machine state have spans that do
// not overlap. We use the term replica-descriptor to refer to this latest
// non-provisional RangeDescriptor, in the text below.
//
// DEFINITION (InitializedStateMachine): a Replica with state
// InitializedStateMachine has state machine state and obeys the invariants
// RaftAndStateConsistency, StateConsistency, InterReplicaStateConsistency.
//
// DEFINITION (DeletedReplica): it can be convenient to reference Replicas
// that once existed but no longer do, as evidenced by the presence of a
// RangeTombstone for a RangeID, but no state machine or raft state.
// RangeTombstone.NextReplicaID is populated with a value > the last ReplicaID
// seen by ReplicasStorage for this range. Note that RangeTombstone is
// populated even for ranges that no longer exist (RHS of a merge) -- in this
// case it is set to a constant (mergedTombstoneReplicaID, equal to MaxInt32).
//
// DEFINITION (UninitializedStateMachine): this is a Replica with no state
// machine, i.e., there is Raft state and possibly a RangeTombstone. In
// particular, there is no RangeDescriptor and so it has no key span
// associated with it yet. The RangeTombstone.NextReplicaID is <=
// RaftReplicaID.ReplicaID.
// The HardState{Term,Vote} can have arbitrary values since this replica can
// vote. However, it has a zero HardState.Commit and no log entries -- this
// Raft invariant is upheld externally by a combination of mostly external
// invariants:
// A new Range is initialized with all Replicas at truncated index equal to
// RaftInitialLogIndex (10) (so they are in InitializedStateMachine state),
// and any future Replicas will be initialized via a snapshot reflecting a
// nonzero applied index >= 10. In particular, prior to receiving the
// snapshot, no log entries can be sent to the Replica. And etcd/raft only
// communicates Commit entries for which the recipient has the log entry.
//
// Some of the above invariants may be violated when non-durable state is lost
// due to a crash, but ReplicasStorage.Init is required to fix the persistent
// state such that the above invariants are true. These are not exposed to the
// user of the interface. We list these below.
//
// DEFINITION (RecoveryDeletingReplica): a Replica whose Raft state requires a
// nonzero applied index in the state machine, but there is no state machine
// state. This is an intermediate state entered when transitioning from
// InitializedStateMachine to DeletedReplica, after the state machine state
// has been deleted and RangeTombstoneKey updated and before the raft state
// has been deleted. This is distinguishable from UninitializedStateMachine
// since RaftTruncatedState.{Index,Term} are guaranteed to exist and have
// values >= RaftInitialLogIndex, RaftInitialLogTerm. ReplicasStorage.Init
// will transition out of this state into DeletedReplica state.
// The RecoveryDeletingReplica can also occur when removing a replica in state
// UninitializedStateMachine. This is because the RangeTombstone is written
// first to the state machine, and subsequently the raft state is removed.
// This corresponds to the condition RangeTombstone.NextReplicaID >
// RaftReplicaID.ReplicaID.
//
// DEFINITION (RecoveryInconsistentReplica): This is a Replica that mostly
// appears to be in state InitializedStateMachine, but has suffered
// regression in durable state such that the state machine has advanced past
// HardState.Commit, or a snapshot has been applied and all raft log entries
// are < RaftAppliedIndex, i.e., it violates RaftAndStateConsistency
// invariants. More severely, it can also violate StateConsistency invariants
// by having durably ingested SSTables but not yet updated the
// RaftAppliedIndex to reflect that state machine change. ReplicasStorage.Init
// restores all the invariants needed by an InitializedStateMachine replica,
// by fixing the raft log to be consistent with the state machine, and
// re-applying log entries up to HardState.Commit (except for log entries that
// indicate a split or merge -- see below).
//
// Replica state transitions:
// - Initial state: UninitializedStateMachine
// - Final state: DeletedReplica
// - UninitializedStateMachine => RecoveryDeletingReplica, InitializedStateMachine
// - InitializedStateMachine => RecoveryDeletingReplica, RecoveryInconsistentReplica
// - RecoveryDeletingReplica => DeletedReplica
// - RecoveryInconsistentReplica => InitializedStateMachine
//
// ============================================================================
// Implementation constraints on ReplicasStorage:
// - Splits and Merges typically happen by applying an entry in the raft log.
// It is feasible for ReplicasStorage.Init to apply such committed entries.
// However, the logic in such cases can add additional mutations to the
// batch in the raft log, that have nothing to do with the normal scope of
// what ReplicasStorage is concerned with. For example, splitPreApply has
// logic to set RangeAppliedState.RaftClosedTimestamp. For this reason
// ReplicasStorage ensures durability of split/merge application and does
// not apply any Split/Merge log entries in ReplicasStorage.Init.
//
// ============================================================================
// Replica Initialization:
//
// Since ReplicasStorage does not permit atomic updates spanning the state
// machine and raft state (even if they are a single engine), replica creation
// is sequenced as (* indicates durable writes):
//
// - [C1*] creation of RaftHardStateKey in raft state with
// {Term:0,Vote:0,Commit:0}. This is a replica in UninitializedStateMachine
// state.
// - [C2*] creation of state machine state (via snapshot or some synthesized
// state for range-ID and range local keys in the case of split).
// - [C3] set RaftTruncatedStateKey with RaftTruncatedState.{Index,Term} equal
// to RangeAppliedState.{RaftAppliedIndex,RaftAppliedIndexTerm} and adjust
// value of RaftHardStateKey (specifically HardState.Commit needs to be set
// to RangeAppliedState.RaftAppliedIndex -- see below for details). Also
// discard all raft log entries if any (see below). At this point the
// replica is in InitializedStateMachine state.
//
// Every step above needs to be atomic. Note that we are doing 2 syncs, in
// steps C1 and C2, for the split case, where we currently do 1 sync -- splits
// are not common enough for this to matter. If we did not sync C2 we could
// start adding log entries after C3 but lose the state machine state in the
// case of a crash, which would violate the replica state invariants.
//
// An initialized replica that receives a snapshot because it has lagged
// behind will execute C2 and C3. The C3 step throws away all the existing
// raft log entries. So a precondition for applying such a snapshot is:
// - The raft log does not have entries beyond the snapshot's
// RangeAppliedState.RaftAppliedIndex. If it did, there would be no benefit
// in applying this snapshot.
// The following etcd/raft code
// https://github.com/etcd-io/etcd/blob/7572a61a39d4eaad596ab8d9364f7df9a84ff4a3/raft/raft.go#L1584-L1589
// ensures this behavior -- if the raft log entry corresponding to the
// snapshot is already present locally, it only advances the commit index to
// the snapshot index, and does not actually apply the snapshot.
// - Corollary: since HardState.Commit cannot refer to log entries beyond the
// locally persisted ones, the existing HardState.Commit <=
// RangeAppliedState.RaftAppliedIndex, so the HardState manipulation done in
// step C3 will only need to increase the value of HardState.Commit.
//
// Why C2 before C3?:
// If we performed step C3 before C2, there is a possibility that a crash
// prevents C2. Now we would need to rollback the change made in C3 to reach a
// fully consistent state on crash recovery. Rolling back HardState.Commit is
// easy, since there is no raft log, we can set it to
// RangeAppliedState.RaftAppliedIndex if it exists, else 0. Similarly, we can
// rollback RaftTruncatedState by either:
// - deleting it if the RangeAppliedState does not exist, which implies C3 did
// not happen.
// - if RangeAppliedState exists, roll back RaftTruncatedState.{Index,Term} to
// RangeAppliedState.{RaftAppliedIndex,RaftAppliedIndexTerm}. Note that this
// is a case where an already initialized lagging replica has a snapshot
// being applied.
// The correctness problem with doing C3 before C2 is that the store violates
// raft promises it has made earlier. For example, say the state machine had
// applied index 20 and the raft log contained [15, 25), then this store is
// part of the quorum that causes [21, 25) to commit. We receive a snapshot
// for 30, and crash after C3, and since C3 is before C2 in this scenario, we
// rollback to 20 and have no raft state. Therefore, this is in effect an
// unavailable replica, since it no longer has [21, 25).
//
// Rolling forward if crash after C2 and before C3:
// ReplicasStorage.Init will roll forward to C3 when initializing itself.
// - If HardState.Commit < RangeAppliedState.RaftAppliedIndex, update
// HardState.Commit
// - If RaftTruncatedState does not exist, or RaftTruncatedState.Index <
// RangeAppliedState.RaftAppliedIndex and all log entries are <=
// RangeAppliedState.RaftAppliedIndex
// - Discard all raft log entries.
// - Set RaftTruncatedState.{Index,Term} using
// RangeAppliedState.{RaftAppliedIndex,RaftAppliedIndexTerm}
//
// Aside:
// Since we now have RangeAppliesState.RaftAppliedIndexTerm, constructing an
// outgoing snapshot only involves reading state machine state (this is a tiny
// bit related to #72222, in that we are also assuming here that the outgoing
// snapshot is constructed purely by reading state machine engine state).
// TODO(sumeer): we can do this on master since RaftAppliedIndexTerm was
// introduced in 22.1.
//
// ============================================================================
// Replica Deletion:
//
// Replica deletion is sequenced as the following steps (* indicates durable
// writes):
//
// - [D1*] deletion of state machine state (iff the replica is in state
// InitializedStateMachine) and write to the RangeTombstoneKey. If prior to
// this step the range was in state InitializedStateMachine, it is now in
// state RecoveryDeletingReplica. If it was in state UninitializedStateMachine,
// again it is now in the state RecoveryDeletingReplica.
// This latter case can occur for various reasons: one cause is this range
// is the RHS of a split where the split has not yet happened, but we've
// created an uninitialized RHS. So we can't delete the state machine state
// for the RHS since it doesn't exist yet (there is some replicated state in
// the state machine that could in the future belong to the RHS, but not
// yet, and we don't know the span of that future RHS either). By updating
// the RangeTombstone, when the split occurs, D1 will be repeated.
// - [D2] deletion of all Raft state for this RangeID, i.e., RaftHardStateKey,
// RaftTruncatedStateKey, log entries, RangeLastReplicaGCTimestampKey.
//
// Every step above needs to be atomic. One of the reasons to sync after D1 is
// that we could later execute C1 when adding the range back to this store, and
// then crash. On crash recovery we'd find the raft HardState and old state
// machine state and incorrectly think this is an initialized replica.
//
// The merge operation alters step D1 to only write the RangeTombstoneKey and
// delete the range-id local keys in the RHS.
//
// Note that we don't delete the RangeTombstoneKey even when the range itself
// is being deleted (due to a merge). The replay protection offered by it is
// more important than the minuscule cost of leaking a RangeTombstoneKey per
// range. It is possible to have some cleanup of RangeTombstoneKeys for long
// dead ranges, but it is outside of the scope here.
//
// A crash after D1 will result in a replica in state RecoveryDeletingReplica.
// ReplicasStorage.Init will execute D2. See also
// https://github.com/cockroachdb/cockroach/issues/73424 which presumably
// deals with cleaning up UninitializedStateMachine replicas in the absence of
// a crash.
//
// ============================================================================
// Normal Replica Operation:
//
// - ReplicasStorage is used to append/replace log entries and update
// HardState. This is done via a RaftMutationBatch. There is a
// RaftMutationBatch.MustSync that the caller uses to specify the minimal
// sync requirements imposed by Raft (ReplicasStorage is not in the business
// of understanding Raft correctness requirements). Typically MustSync will
// be true only if entries are appended, or a vote and/or term change has to
// be recorded. In particular, a change solely to HardState.Commit would
// have MustSync=false. See
// https://github.com/etcd-io/etcd/blob/7572a61a39d4eaad596ab8d9364f7df9a84ff4a3/raft/node.go#L584-L593.
// Note that this means that HardState.Commit can regress and become less
// than RangeAppliedState.RaftAppliedIndex in case of a crash. We will fix
// this in ReplicasStorage.Init, as discussed later.
//
// - The caller keeps track of HardState.Commit, since it constructed
// HardState for the RaftMutationBatch. It applies committed entries to the
// state machine using ApplyCommittedUsingIngest and ApplyCommittedBatch.
// The ApplyCommitted* methods should not be used for log entries that are
// performing splits or merges -- the caller should do those by calling
// SplitReplica and MergeReplicas. ReplicasStorage decides when it is
// necessary to sync -- ApplyCommitted* will not sync the state machine, and
// SplitReplica/MergeReplicas will sync the state machine. Note that the
// caller may not need to read a raft entry from ReplicasStorage in order to
// apply it, if it happens to have stashed it somewhere in its in-memory
// data-structures.
//
// For log entries that are ingesting side-loaded files, the application of
// a single entry is split into a pair, ApplyCommittedUsingIngest, that
// usually does not update the RaftAppliedIndex and then ApplyCommittedBatch
// which updates the RaftAppliedIndex. A crash after the first and before
// the second leaves the state machine in an inconsistent state
// (RecoveryInconsistentReplica) which needs to be fixed by
// ReplicasStorage.Init. For this reason, ReplicasStorage keeps track of the
// highest HardState.Commit known to be durable, and requires
// ApplyCommittedUsingIngest to provide the highestRaftIndex of the changes
// included in the files. ReplicasStorage will sync the raft state if
// needed, to make the highestRaftIndex durable, before ingesting these
// files. This prevents regression of HardState.Commit past an index that
// contains side-loaded files. Note that this assumes that
// ReplicasStorage.Init has the capability of applying all raft log entries
// except for splits and merges (we've already mentioned that splits/merges
// are made durable at application time).
//
// - Log truncation is done by the caller, based on various signals relevant
// to the proper functioning of the distributed raft group. The truncation
// is notified via RangeStorage.TruncatedRaftLog. This is breaking the
// abstraction, in that the caller has changed the raft log and removed
// sideloaded entries without going through RangeStorage, so we should see
// if we can do better here. The current log truncation methods are:
// - For strongly-coupled truncation (with a single engine): the truncation
// happens when applying to the state machine (ApplyCommittedBatch) and we
// don't want to leak this information via the MutationBatch that is
// supposed to only touch the state machine. See more details below.
// - For loosely-coupled truncation (single engine or separate engines): the
// truncation happens in raftLogTruncator.tryEnactTruncations which is
// mutating only the raft log. For this case we could keep all the
// business logic related to deciding what to truncate (which requires
// interaction with the Replica object) in raftLogTruncator, while giving
// RangeStorage the actual batch (with additional information on what is
// being truncated) to commit.
// As of https://github.com/cockroachdb/cockroach/pull/80193 the
// loosely-coupled raft log truncation is disabled due to a performance
// regression in write-heavy workloads (see comment
// https://github.com/cockroachdb/cockroach/issues/78412#issuecomment-1119922463
// for conclusion of investigation).
//
// TODO(sumeer): the following "revised plan" comment is from Sep 2022 and
// probably stale.
//
// The revised plan is to
// - Do strongly-coupled truncation in
// CanTruncateRaftIfStateMachineIsDurable for a ReplicasStorage
// implementation that shares the same engine for the state machine and
// raft state. This relies on external code structure for correctness: the
// truncation proposal flows through raft, so we have already applied the
// state machine changes for the preceding entries. A crash will cause a
// suffix of the unsynced changes to be lost, so we cannot lose the state
// machine changes while not losing the truncation.
//
// This is similar to the correctness argument that the code preceding
// ReplicasStorage relies on. The separation of the RaftMutationBatch
// provided to DoRaftMutation and the MutationBatch provided to
// ApplyCommittedBatch is only more formalization of the separation that
// already exists: handleRaftReadyRaftMuLocked makes raft changes with one
// batch, and replicaAppBatch.ApplyToStateMachine is used to make changes
// to the state machine with another batch.
// replicaAppBatch.ApplyToStateMachine also does the raft log truncation,
// and raft changes for splits and merges, which ReplicasStorage is doing
// in a different way, but it does not change the correctness claim. Note
// that #38566, a flaw in this correctness argument, has since been fixed.
//
// - Do loosely-coupled truncation for a ReplicasStorage implementation that
// has different engines for the state machine and raft state. The
// experiments in
// https://github.com/cockroachdb/cockroach/issues/16624#issuecomment-1137394935
// have demonstrated that we do not have a performance regression. We
// speculate that the absence of performance regression is due to:
// - With multiple key-value pairs in a batch, the memtable for the raft
// engine will be able to store more than the corresponding state
// machine memtable where the key-value pairs get individual entries in
// the memtable. This is because of the per-entry overhead. This means
// there is a decent probability that the state machine memtable will
// start getting flushed before the corresponding raft engine memtable
// is flushed. If the flush is fast enough, we would be able to truncate
// the raft log before the raft log entries are flushed.
// - The smaller raft engine will have a higher likelihood that deletes
// due to truncation get flushed to L0 while the log entry being deleted
// is also in L0. This should reduce the likelihood of wasteful
// compaction of raft log entries to lower levels.
//
// - Range merges impose an additional requirement: the merge protocol (at a
// higher layer) needs the RHS replica of a merge to have applied all raft
// entries up to a specified index and that this application is durable. To
// ensure the durability we expose a SyncStateMachine method for the higher
// layer.
//
// ============================================================================
// Crash Recovery:
// ReplicasStorage needs to be self contained in the sense that it must be
// able to execute state changes to reach a fully consistent state without
// needing any external input, as part of its initialization. Init will block
// until all the raft and state machine states have been made mutually
// consistent.
// - Iterate over RaftHardStateKeys and identify a set of replicas R_r. This
// is efficiently done by seeking to the current RangeID+1.
// - Iterate over RangeDescriptorKeys and identify a set of replicas R_s. This
// is efficiently done by using the latest non-provisional RangeDescriptor
// (replica-descriptor) of the current range and then seeking to the end key
// of the range's span.
// - Note that this way of skipping spans will ensure that we will not find
// RangeDescriptors that have overlapping spans, which is ideally an
// invariant we should verify. Instead of verifying that invariant, which
// is expensive, we additionally iterate over all the
// RangeAppliedStateKeys, which are Range-ID local keys -- this iteration
// can be accomplished by seeking using current RangeID+1. If we find
// RangeAppliedStateKeys whose RangeID is not mentioned in a corresponding
// RangeDescriptor we have an invariant violation.
// - The set R_s - R_r must be empty, i.e., R_s is a subset of R_r.
// - The set R_r - R_s consists of replicas that are either in state
// UninitializedStateMachine or RecoveryDeletingReplica.
// - Remove RecoveryDeletingReplica replicas by transitioning them to DeletedReplica
// by executing D2.
// - The set R_s are replicas that ought to be in state
// InitializedStateMachine, though may have been in the middle of that state
// transition, or become inconsistent for other reasons mentioned earlier.
// That is, they are potentially in RecoveryInconsistentReplica state.
// - If RangeAppliedState.RaftAppliedIndex > HardState.Commit (NB: HardState
// must exist), execute the following atomically:
// - If there are no log entries or all log entries are <
// RaftAppliedIndex: remove all log entries and set
// RaftTruncatedState.{Index,Term} equal to
// RangeAppliedState.{RaftAppliedIndex,RaftAppliedIndexTerm}.
// - Set HardState.Commit to RaftAppliedIndex.
// These steps handle (a) crash in the middle of replica creation (doing
// step C3), and (b) regression of HardState.Commit under normal replica
// operation. The RaftAndStateConsistency invariant now holds.
// - The StateConsistency invariant may not hold. To ensure that it holds:
// for ranges whose RangeAppliedState.RaftAppliedIndex < HardState.Commit,
// apply log entries, including those that remove this replica, until one
// encounters a log entry that is performing a split or merge.
// - InitializedStateMachine replicas:
// - using the replica-descriptors, check that the spans do not overlap.
// - This InterReplicaStateConsistency invariant must also hold before we
// fixed the RecoveryInconsistentReplicas, so we could additionally check
// it then.
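//
// For illustration only (nothing below is prescribed by this document), the
// recovery pass described above could be sketched roughly as follows, where
// the receiver type and helper names are hypothetical:
//
//   func (rs *replicasStorageImpl) Init(ctx context.Context) error {
//     rRaft := rs.scanRaftHardStateKeys(ctx)    // R_r
//     rState := rs.scanRangeDescriptorKeys(ctx) // R_s; must be a subset of R_r
//     for _, id := range setDifference(rRaft, rState) {
//       // UninitializedStateMachine or RecoveryDeletingReplica: finish
//       // deletion by executing D2 where needed.
//       rs.maybeFinishDeletion(ctx, id)
//     }
//     for _, id := range rState {
//       // Restore RaftAndStateConsistency (clamp HardState.Commit, fix
//       // RaftTruncatedState) and StateConsistency (apply committed log
//       // entries) as described above.
//       rs.makeReplicaConsistent(ctx, id)
//     }
//     return rs.checkReplicaSpansDoNotOverlap(ctx)
//   }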
// ============================================================================
// ReplicaState represents the current state of a range replica in this store.
type ReplicaState int
const (
// UninitializedStateMachine is a replica with raft state but no state
// machine.
UninitializedStateMachine ReplicaState = iota
// InitializedStateMachine is a replica with both raft state and state
// machine.
InitializedStateMachine
// DeletedReplica is a replica with neither raft state nor state machine.
DeletedReplica
)
// FullReplicaID is a fully-qualified replica ID.
type FullReplicaID struct {
// RangeID is the id of the range.
RangeID roachpb.RangeID
// ReplicaID is the id of the replica.
ReplicaID roachpb.ReplicaID
}
// SafeFormat implements redact.SafeFormatter. It prints as
// r<rangeID>/<replicaID>.
func (id FullReplicaID) SafeFormat(s interfaces.SafePrinter, _ rune) {
s.Printf("r%d/%d", id.RangeID, id.ReplicaID)
}
// String formats the FullReplicaID for debug output.
func (id FullReplicaID) String() string {
return redact.StringWithoutMarkers(id)
}
// ReplicaInfo provides the replica ID and state pair.
type ReplicaInfo struct {
FullReplicaID
// State of the replica.
State ReplicaState
}
// MutationBatch can be committed to the underlying engine. Additionally, it
// provides access to the underlying Batch. In some cases the implementation
// of ReplicasStorage will add additional mutations before committing. We
// expect the caller to know which engine to construct a batch from, in order
// to update the state machine or the raft state. ReplicasStorage does not
// hide such details since we expect the caller to mostly do reads using the
// engine Reader interface.
//
// TODO(sumeer): there are some operations that need to use the storage.Batch
// as a Reader. Be clear on when a storage.Batch can be read from, and whether
// it includes the changes in the batch or is reading from the underlying DB
// (see the code in pebbleBatch that selects between the two). Ideally, the
// supported semantics should be specified via an interface method on
// storage.Batch, and ReplicasStorage can check that the semantics are what it
// expects.
type MutationBatch interface {
// Commit writes the mutation to the engine.
Commit(sync bool) error
// Batch returns the underlying storage.Batch.
Batch() Batch
}
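//
// For illustration only, a minimal MutationBatch implementation could simply
// wrap a Batch; the type name below is hypothetical and not part of this
// proposal:
//
//   type simpleMutationBatch struct {
//     b Batch
//   }
//   func (m simpleMutationBatch) Commit(sync bool) error { return m.b.Commit(sync) }
//   func (m simpleMutationBatch) Batch() Batch           { return m.b }
//   var _ MutationBatch = simpleMutationBatch{}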
// RaftMutationBatch specifies mutations to the raft log entries and/or
// HardState.
type RaftMutationBatch struct {
// MutationBatch should be created using NewUnindexedBatch(false) so it
// can be read from, but the reads will read the underlying engine. It is
// important for the reads to not see the state in the batch, since reading
// the underlying engine is used to clear stale raft log entries that are
// getting overwritten by this batch.
MutationBatch
// [Lo, Hi) represents the raft log entries, if any, in the MutationBatch.
// This is appending/overwriting entries in the raft log. That is, if the
// log is [a,b,c,d], with a at index 12 and one appends e at index 13, the
// result will be [a,e]. Note that the MutationBatch only contains the write
// at index 13, and the removal of indices 14, 15 is done by the callee. The
// callee assumes that the entries to remove are from Term-1. We assume the
// caller is upholding Raft semantics (such as not overwriting raft log
// entries that have been committed) -- ReplicasStorage is not in the
// business of validating that such semantics are being upheld.
Lo, Hi uint64
// Term represents the term of those entries.
Term uint64
// HardState, if non-nil, specifies the HardState value being set by
// MutationBatch.
HardState *raftpb.HardState
// MustSync is set to true if the mutation must be synced.
MustSync bool
}
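//
// For illustration only (a hypothetical usage sketch, not prescribed by this
// proposal): appending entry 13 at term 7 and advancing HardState.Commit to
// 13 could be expressed as
//
//   rBatch := RaftMutationBatch{
//     MutationBatch: mb, // batch already containing the write at index 13
//     Lo:            13,
//     Hi:            14,
//     Term:          7,
//     HardState:     &raftpb.HardState{Term: 7, Commit: 13},
//     MustSync:      true,
//   }
//   err := rangeStorage.DoRaftMutation(ctx, rBatch)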
// RangeStorage is a handle for a FullReplicaID that provides the ability to
// write to the raft state and state machine state. This could have been named
// ReplicaStorage, but that sounds too similar to ReplicasStorage. Note that,
// even though a caller can have two different RangeStorage handles for the
// same range, if the range has been added and removed and so has different
// ReplicaIDs, at most one of them will be in state != DeletedReplica.
//
// Other than the FullReplicaID() method, the methods access mutable state,
// and so may not execute concurrently.
type RangeStorage interface {
// FullReplicaID returns the FullReplicaID of this replica.
FullReplicaID() FullReplicaID
// State returns the ReplicaState of this replica.
State() ReplicaState
// CurrentRaftEntriesRange returns [lo, hi) representing the locally stored
// raft entries. These are guaranteed to be locally durable.
CurrentRaftEntriesRange(ctx context.Context) (lo uint64, hi uint64, err error)
// GetHardState returns the current HardState. HardState.Commit is not
// guaranteed to be durable.
GetHardState(ctx context.Context) (raftpb.HardState, error)
// TruncatedRaftLog provides the inclusive index up to which truncation has
// been done.
TruncatedRaftLog(index uint64)
// DoRaftMutation changes the raft state. This will also purge sideloaded
// files if any entries are being removed.
// REQUIRES: if rBatch.Lo < rBatch.Hi, the range is in state
// InitializedStateMachine.
DoRaftMutation(ctx context.Context, rBatch RaftMutationBatch) error
// TODO(sumeer):
// - add raft log read methods.
// - what raft log stats do we need to maintain and expose (raftLogSize?)?
// We could accept a callback with a truncated index parameter that
// RangeStorage invokes whenever it truncates entries, and let the caller
// maintain the size.
// State machine commands.
// IngestRangeSnapshot ingests a snapshot for the range.
// - The replica-descriptor in the snapshot describes the range as equal to
// span.
// - The snapshot corresponds to application of the log up to
// raftAppliedIndex.
// - sstPaths represent the ssts for this snapshot, and do not include
// anything other than state machine state and do not contain any keys
// outside span (after accounting for range and replicated range-ID local
// keys), or Range-ID keys for other ranges.
// - sstPaths include a RANGEDEL that will clear all the existing state
// machine state in the store for span (including range-id and range local
// keys) "before" adding the snapshot state (see below for additional
// RANGEDELs that may be added by ReplicasStorage if the previous span for
// this replica was wider).
// NB: the ssts contain RangeAppliedState, RangeDescriptor (including
// possibly a provisional RangeDescriptor). Ingestion is the only way to
// initialize a range except for the RHS of a split.
//
// Snapshot ingestion will not be accepted in the following cases:
// - span overlaps with the (span in the) replica-descriptor of another
// range, unless the range is listed in subsumedRangeIDs. The ranges
// listed in subsumedRangeIDs must have spans that at least partially
// overlap with span.
// TODO(sumeer): copy the partial overlap example documented in
// replica_raftstorage.go clearSubsumedReplicaDiskData.
// The span of a range can change only via IngestRangeSnapshot,
// SplitReplica, MergeRange, so ReplicasStorage can keep track of all
// spans without resorting to repeated reading from the engine.
// - the raft log already has entries beyond the snapshot (this is an
// invariant that Raft is already expected to maintain, so it is not
// an expected error).
//
// For reference, ReplicasStorage will do:
// - If this replica is already initialized compute
// rhsSpan = current span - span provided in this call.
// rhsSpan is non-empty if we are moving the LHS past a split using a
// snapshot. In this case any replica(s) corresponding to rhsSpan cannot
// possibly be in InitializedStateMachine state (since it would be a
// violation of spans being non-overlapping). That is, they may be
// - participating in the raft group(s) for the RHS, but will not have any
// log entries.
// - rebalanced away.
// In either case, it is safe to clear all range local and global keys for
// the rhsSpan. ssts will be added to clear this state. Note, that those
// uninitialized ranges cannot have any replicated range-ID local keys.
// They may have a RangeTombstoneKey, but it is not something this method
// needs to touch.
// - Add additional ssts that clear the replicated Range-ID keys for the
// subsumed ranges, set the RangeTombstone to mergedTombstoneReplicaID,
// and the non-overlapping replicated range key spans for the subsumed
// range.
// - Atomically ingest the ssts. This does step C2 for this range and D1 for
// all subsumed ranges. This is durable. A crash after this step and
// before the next step is rolled forward in ReplicasStorage.Init.
// - Do steps C3 for this range and steps D2 for the subsumed ranges.
//
// In handleRaftReadyRaftMuLocked, if there is a snapshot, it will first
// call IngestRangeSnapshot, and then DoRaftMutation to change the
// HardState.{Term,Vote,Commit}. Note that the etcd/raft logic fast forwards
// the HardState.Commit to the snapshot index, so the DoRaftMutation will
// not actually change the stored value of HardState.Commit from what was
// already set in IngestRangeSnapshot.
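//
// For illustration only, that flow could be sketched as follows; the local
// variable names here are hypothetical:
//
//   if err := rng.IngestRangeSnapshot(
//     ctx, span, snapIndex, sstPaths, subsumedRangeIDs, sstScratch,
//   ); err != nil {
//     return err
//   }
//   // HardState.Commit has already been fast-forwarded to snapIndex, so this
//   // mutation persists only the Term/Vote changes from the Ready.
//   return rng.DoRaftMutation(ctx, RaftMutationBatch{
//     MutationBatch: raftBatch,
//     HardState:     &hardState,
//     MustSync:      true,
//   })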
IngestRangeSnapshot(
ctx context.Context, span roachpb.RSpan, raftAppliedIndex uint64,
sstPaths []string, subsumedRangeIDs []roachpb.RangeID,
sstScratch struct{} /* TODO(sumeer): kvserver.SSTSnapshotStorageScratch */) error
// ApplyCommittedUsingIngest applies committed changes to the state machine
// state by ingesting sstPaths. highestRaftIndex is the highest index whose
// changes are included in the sstPaths. This is due to "sideloaded sst"
// raft log entries. These ssts do not contain an update to
// RangeAppliedState, so this call must be immediately followed by a call to
// ApplyCommittedBatch that does update the RangeAppliedState.
// It is possible for the node containing this store to crash prior to that
// call to ApplyCommittedBatch -- this is ok since ReplicasStorage.Init will
// replay this idempotent ingest and the following ApplyCommittedBatch.
// REQUIRES: replica is in state InitializedStateMachine.
ApplyCommittedUsingIngest(
ctx context.Context, sstPaths []string, highestRaftIndex uint64) error
// ApplyCommittedBatch applies committed changes to the state machine state.
// Does not sync. Do not use this for applying raft log entries that perform
// split, merge, or remove this replica (due to rebalancing) -- see the
// methods in ReplicasStorage that accomplish that.
// REQUIRES: replica is in state InitializedStateMachine (this is because we
// create a new range with the first log entry at RaftInitialLogIndex (10),
// so a range always requires an initial state "snapshot" before it can
// apply raft entries).
ApplyCommittedBatch(smBatch MutationBatch) error
// SyncStateMachine is for use by higher-level code that needs to ensure
// durability of the RHS of a merge. It simply syncs the state machine state
// to ensure all previous mutations are durable.
// REQUIRES: replica is in state InitializedStateMachine.
SyncStateMachine(ctx context.Context) error
}
// ReplicasStorage provides an interface to manage the persistent state of a
// store that includes the lifecycle of a range replica, its raft log, and the
// state machine state. See the comment at the top of the file.
type ReplicasStorage interface {
// Init will block until all the raft and state machine states have been
// made consistent.
Init(ctx context.Context) error
// CurrentRanges returns the replicas in the store. It does not return any
// ranges with state DeletedReplica, since it has no knowledge of them.
CurrentRanges() []ReplicaInfo
// GetRangeTombstone returns the nextReplicaID in the range tombstone for
// the range, if any.
GetRangeTombstone(
ctx context.Context, rangeID roachpb.RangeID) (nextReplicaID roachpb.ReplicaID, err error)
// GetHandle returns a handle for a range listed in CurrentRanges().
// ReplicasStorage will return the same handle object for a FullReplicaID
// during its lifetime. Once the FullReplicaID transitions to DeletedReplica
// state, ReplicasStorage will forget the RangeStorage handle and it is up
// to the caller to decide when to throw away a handle it may be holding
// (the handle is not really usable for doing anything once the range is
// deleted).
GetHandle(rr FullReplicaID) (RangeStorage, error)
// CreateUninitializedRange is used when rebalancing adds a range to this
// store, or when a peer informs this store that it has a replica of a
// range. This is the first step in creating a raft group for this
// FullReplicaID. It will return an error if:
// - This ReplicaID is too old based on the RangeTombstone.NextReplicaID
// - There already exists some state under any raft key for this range.
//
// The call will cause HardState to be initialized to
// {Term:0,Vote:0,Commit:0}.
//
// Typically there will be no state machine state for this range. However it
// is possible that a split is delayed and some other store has informed this
// store about the RHS of the split, in which case part of the state machine
// (except for the Range-ID keys, RangeDescriptor) already exists. Note that
// this locally lagging split case is one where the RHS does not transition
// to initialized via anything other than a call to SplitReplica (i.e., does
// not apply a snapshot), except when the LHS moves past the split using a
// snapshot, in which case the RHS can also then apply a snapshot.
CreateUninitializedRange(ctx context.Context, rr FullReplicaID) (RangeStorage, error)
// SplitReplica is called to split range r into a LHS and RHS, where the RHS
// is represented by rhsRR. The smBatch specifies the state machine state to
// modify for the LHS and RHS. For the RHS, the smBatch must be constructing
// the appropriate range-ID local state and range local state that doesn't
// already exist in the store (including the RangeDescriptor). rhsSpan is
// the span in the RangeDescriptor for the RHS. The following cases can
// occur:
//
// - [A1] RangeTombstone for the RHS indicates rhsRR.ReplicaID has already
// been removed. Two subcases:
// - [A11] There exists a HardState for rhsRR.RangeID: the range has been
// added back with a new ReplicaID.
// - [A12] There exists no HardState, so rhsRR.RangeID should not exist on
// this store.
// - [A2] RangeTombstone for the RHS indicates that rhsRR.ReplicaID has not
// been removed.
//
// For A11 and A12, the smBatch will be altered to clear all state in the
// state machine for the RHS. The final state the RHS will be in is
// UninitializedStateMachine for A11 and DeletedReplica for A12. For A2, the smBatch
// is not altered and the final RHS state is InitializedStateMachine. If the
// final RHS state is DeletedReplica, a nil RangeStorage will be returned.
// The application of smBatch is synced.
//
// From our earlier discussion of replica creation and deletion:
// - For case A2, the callee will perform step C1 if needed, then commit
// smBatch (step C2), and then perform step C3.
// - For case A11 there is no need to do step C1. Steps C2 and C3 cannot be
// performed since the RHS ReplicaID has changed and the state here is
// stale. All we are doing is cleaning up the state machine state for the
// RHS when committing smBatch. The callee is doing step D1 of deletion,
// of the RHS for the old replicaID.
// - For case A12, the callee is doing step D1 of deletion, by altering and
// committing smBatch. Since the RHS range never transitioned to
// initialized (it never had a RangeDescriptor), the deletion was unable
// to execute D1 when the HardState etc. was being deleted (it only
// executed D2). The RHS will continue to be in DeletedReplica state when
// the method returns.
//
// REQUIRES: The range being split is in state InitializedStateMachine, and
// RHS either does not exist or is in state UninitializedStateMachine.
//
// Called below Raft -- this is being called when the split transaction commits.
SplitReplica(
ctx context.Context, r RangeStorage, rhsRR FullReplicaID, rhsSpan roachpb.RSpan,
smBatch MutationBatch,
) (RangeStorage, error)
// MergeReplicas is called to merge two range replicas. smBatch contains
// changes to the LHS state machine to incorporate the state of the RHS, and
// the intent resolution of the RHS RangeDescriptor.
//
// It will perform the following steps:
// - Alter smBatch to remove all Range-ID local keys in the RHS and write the
// RangeTombstone to the RHS with value mergedTombstoneReplicaID.
//
// - Apply and sync smBatch, which transforms the LHS into the merged range,
// and performs step D1 for the RHS. The sync ensures that a crash after
// this step and before the next step will be rolled forward in Init.
//
// - Do step D2 for the RHS.
//
// REQUIRES: LHS and RHS are in state InitializedStateMachine, and RHS has
// durably applied all commands up to the merge.
//
// Code above this layer ensures the above durability of application of all
// commands in the RHS and additionally ensures that the RHS of a merge is
// immovable once in the critical phase (i.e. once the SubsumeRequest has been
// handled), until the merge txn aborts (if it ever does). On the leaseholder
// handling Subsume, this is done by the Subsume request itself. But we also
// prevent all future leaseholders from doing anything that would violate
// the critical phase by observing the deletion intent on the range
// descriptor. If a merge transaction commits, regardless of which replicas
// know about this yet, the LHS and RHS will be fully colocated.
//
// Called below Raft -- this is being called when the merge transaction commits.
MergeReplicas(
ctx context.Context, lhsRS RangeStorage, rhsRS RangeStorage, smBatch MutationBatch) error
// DiscardReplica is called to discard a replica that has been rebalanced
// away. The replica is either in UninitializedStateMachine or
// InitializedStateMachine state. There are multiple reasons for this to be
// called, such as when the raft log entry that removes the replica is being
// applied, or when the ReplicaGCQueue notices that the replica is too old. Due to
// these multiple callers, ReplicasStorage is not in a position to compute
// what the nextReplicaID for the RangeTombstone should be. Therefore, it
// expects the caller to provide that value as a parameter.
DiscardReplica(
ctx context.Context, r RangeStorage, nextReplicaID roachpb.ReplicaID) error
}
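// For illustration only, typical store startup against this interface might
// look like the following sketch (the surrounding variables are hypothetical):
//
//   rs := MakeSingleEngineReplicasStorage(nodeID, storeID, eng, ssConstructor, st)
//   if err := rs.Init(ctx); err != nil {
//     return err
//   }
//   for _, info := range rs.CurrentRanges() {
//     rng, err := rs.GetHandle(info.FullReplicaID)
//     if err != nil {
//       return err
//     }
//     // Instantiate the in-memory replica object for rng.FullReplicaID()
//     // based on rng.State().
//   }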
type sideloadedStorageConstructor func(
rangeID roachpb.RangeID, replicaID roachpb.ReplicaID) struct{} /* TODO(sumeer): SideloadStorage */
// MakeSingleEngineReplicasStorage constructs a ReplicasStorage where the same
// Engine contains the raft log and the state machine.
func MakeSingleEngineReplicasStorage(
nodeID roachpb.NodeID,
storeID roachpb.StoreID,
eng Engine,
ssConstructor sideloadedStorageConstructor,
st *cluster.Settings,
) ReplicasStorage {
// TODO(sumeer): implement
return nil
}
// Copyright 2017 The Cockroach Authors.
//
// Use of this software is governed by the CockroachDB Software License
// included in the /LICENSE file.
package storage
import (
"bytes"
"github.com/cockroachdb/cockroach/pkg/keys"
"github.com/cockroachdb/cockroach/pkg/kv/kvpb"
"github.com/cockroachdb/cockroach/pkg/roachpb"
)
// RowCounter is a helper that counts how many distinct rows appear in the KVs
// passed to `Count`; the running totals are exposed via the embedded
// BulkOpSummary. Note: the `DataSize` field of the BulkOpSummary is *not*
// populated by this and should be set separately.
type RowCounter struct {
kvpb.BulkOpSummary
prev roachpb.Key
}
// Count examines each key passed to it and increments the running count when it
// sees a key that belongs to a new row.
func (r *RowCounter) Count(key roachpb.Key) error {
// EnsureSafeSplitKey is usually used to avoid splitting a row across ranges,
// by returning the row's key prefix.
// We reuse it here to count "rows" by counting when it changes.
// Non-SQL keys are returned unchanged or may error -- we ignore them, since
// non-SQL keys are by definition not SQL rows.
//
// TODO(ajwerner): provide a separate mechanism to determine whether the key
// is a valid SQL key which explicitly indicates whether the key is valid as
// a split key independent of an error. See #43423.
row, err := keys.EnsureSafeSplitKey(key)
if err != nil || len(key) == len(row) {
// TODO(ajwerner): Determine which errors should be ignored and only
// ignore those.
return nil //nolint:returnerrcheck
}
// No change in the key prefix => no new row.
if bytes.Equal(row, r.prev) {
return nil
}
r.prev = append(r.prev[:0], row...)
rem, _, err := keys.DecodeTenantPrefix(row)
if err != nil {
return err
}
_, tableID, indexID, err := keys.DecodeTableIDIndexID(rem)
if err != nil {
return err
}
if r.EntryCounts == nil {
r.EntryCounts = make(map[uint64]int64)
}
r.EntryCounts[kvpb.BulkOpSummaryID(uint64(tableID), uint64(indexID))]++
return nil
}
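// For illustration only, a caller might use RowCounter like this, where the
// kvs slice is hypothetical:
//
//   var rc RowCounter
//   for _, kv := range kvs {
//     if err := rc.Count(kv.Key); err != nil {
//       return err
//     }
//     // DataSize is not maintained by Count and must be set separately.
//     rc.DataSize += int64(len(kv.Key) + len(kv.Value))
//   }
//   // rc.EntryCounts now maps BulkOpSummaryID(tableID, indexID) to the number
//   // of distinct rows seen for that index.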
// Copyright 2023 The Cockroach Authors.
//
// Use of this software is governed by the CockroachDB Software License
// included in the /LICENSE file.
package storage
import (
"context"
"io"
"sync/atomic"
"github.com/cockroachdb/cockroach/pkg/cloud"
"github.com/cockroachdb/errors"
"github.com/cockroachdb/pebble/objstorage/remote"
)
// externalStorageReader implements remote.ObjectReader on top of
// cloud.ExternalStorage.
type externalStorageReader struct {
// Store a reference to the parent Pebble instance. Metrics around remote
// storage reads/writes are stored there.
//
// TODO(bilal): Refactor the metrics out of Pebble, and store a reference
// to just the Metrics struct.
p *Pebble
es cloud.ExternalStorage
objName string
}
var _ remote.ObjectReader = (*externalStorageReader)(nil)
func (r *externalStorageReader) ReadAt(ctx context.Context, p []byte, offset int64) error {
reader, _, err := r.es.ReadFile(ctx, r.objName, cloud.ReadOptions{
Offset: offset,
LengthHint: int64(len(p)),
NoFileSize: true,
})
if err != nil {
return err
}
defer reader.Close(ctx)
for n := 0; n < len(p); {
nn, err := reader.Read(ctx, p[n:])
// The io.Reader interface allows for io.EOF to be returned even if we just
// successfully filled the buffer p and hit the end of file at the same
// time. Treat that case as a successful read.
if err != nil && !(errors.Is(err, io.EOF) && len(p) == nn+n) {
return err
}
n += nn
}
atomic.AddInt64(&r.p.sharedBytesRead, int64(len(p)))
return nil
}
// Close is part of the remote.ObjectReader interface.
func (e *externalStorageReader) Close() error {
*e = externalStorageReader{}
return nil
}
// externalStorageWriter wraps an io.WriteCloser returned by
// externalStorageWrapper and tracks metrics on bytes written to remote storage.
type externalStorageWriter struct {
io.WriteCloser
// Store a reference to the parent Pebble instance. Metrics around remote
// storage reads/writes are stored there.
//
// TODO(bilal): Refactor the metrics out of Pebble, and store a reference
// to just the Metrics struct.
p *Pebble
}
var _ io.WriteCloser = &externalStorageWriter{}
// Write implements the io.Writer interface.
func (e *externalStorageWriter) Write(p []byte) (n int, err error) {
n, err = e.WriteCloser.Write(p)
atomic.AddInt64(&e.p.sharedBytesWritten, int64(n))
return n, err
}
// externalStorageWrapper wraps a cloud.ExternalStorage and implements the
// remote.Storage interface expected by Pebble. Also ensures reads and writes
// to remote cloud storage are tracked in store-specific metrics.
type externalStorageWrapper struct {
p *Pebble
es cloud.ExternalStorage
ctx context.Context
}
// MakeExternalStorageWrapper returns a remote.Storage implementation that wraps
// cloud.ExternalStorage.
func MakeExternalStorageWrapper(ctx context.Context, es cloud.ExternalStorage) remote.Storage {
return &externalStorageWrapper{p: &Pebble{}, es: es, ctx: ctx}
}
var _ remote.Storage = &externalStorageWrapper{}
// Close implements the remote.Storage interface.
func (e *externalStorageWrapper) Close() error {
return e.es.Close()
}
// ReadObject implements the remote.Storage interface.
func (e *externalStorageWrapper) ReadObject(
ctx context.Context, objName string,
) (_ remote.ObjectReader, objSize int64, _ error) {
objSize, err := e.es.Size(ctx, objName)
if err != nil {
return nil, 0, err
}
return &externalStorageReader{
p: e.p,
es: e.es,
objName: objName,
}, objSize, nil
}
// CreateObject implements the remote.Storage interface.
func (e *externalStorageWrapper) CreateObject(objName string) (io.WriteCloser, error) {
writer, err := e.es.Writer(e.ctx, objName)
return &externalStorageWriter{WriteCloser: writer, p: e.p}, err
}
// List implements the remote.Storage interface.
func (e *externalStorageWrapper) List(prefix, delimiter string) ([]string, error) {
var directoryList []string
err := e.es.List(e.ctx, prefix, delimiter, func(s string) error {
directoryList = append(directoryList, s)
return nil
})
if err != nil {
return nil, err
}
return directoryList, nil
}
// Delete implements the remote.Storage interface.
func (e *externalStorageWrapper) Delete(objName string) error {
return e.es.Delete(e.ctx, objName)
}
// Size implements the remote.Storage interface.
func (e *externalStorageWrapper) Size(objName string) (int64, error) {
return e.es.Size(e.ctx, objName)
}
// IsNotExistError implements the remote.Storage interface.
func (e *externalStorageWrapper) IsNotExistError(err error) bool {
return errors.Is(err, cloud.ErrFileDoesNotExist)
}
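// For illustration only, reading a remote object through the wrapper might
// look like the following sketch (the object name is hypothetical):
//
//   wrapper := MakeExternalStorageWrapper(ctx, es)
//   reader, size, err := wrapper.ReadObject(ctx, "000123.sst")
//   if err != nil {
//     return err
//   }
//   defer reader.Close()
//   buf := make([]byte, size)
//   if err := reader.ReadAt(ctx, buf, 0); err != nil {
//     return err
//   }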
// Copyright 2018 The Cockroach Authors.
//
// Use of this software is governed by the CockroachDB Software License
// included in the /LICENSE file.
package storage
import "unsafe"
//go:linkname mallocgc runtime.mallocgc
func mallocgc(size uintptr, typ unsafe.Pointer, needzero bool) unsafe.Pointer
// nonZeroingMakeByteSlice returns a byte slice of the given length without
// zeroing it; callers must overwrite every byte before reading it.
func nonZeroingMakeByteSlice(len int) []byte {
ptr := mallocgc(uintptr(len), nil, false)
return (*[MaxArrayLen]byte)(ptr)[:len:len]
}
// Copyright 2021 The Cockroach Authors.
//
// Use of this software is governed by the CockroachDB Software License
// included in the /LICENSE file.
package storage
import (
"bytes"
"context"
"github.com/cockroachdb/cockroach/pkg/keys"
"github.com/cockroachdb/cockroach/pkg/kv/kvpb"
"github.com/cockroachdb/cockroach/pkg/roachpb"
"github.com/cockroachdb/cockroach/pkg/settings/cluster"
"github.com/cockroachdb/cockroach/pkg/storage/enginepb"
"github.com/cockroachdb/cockroach/pkg/storage/fs"
"github.com/cockroachdb/cockroach/pkg/util/hlc"
"github.com/cockroachdb/cockroach/pkg/util/metamorphic"
"github.com/cockroachdb/errors"
"github.com/cockroachdb/pebble/sstable"
"github.com/cockroachdb/pebble/vfs"
"github.com/cockroachdb/redact"
)
var (
// DisableCheckSSTRangeKeyMasking forcibly disables CheckSSTConflicts range
// key masking. This masking causes stats to be estimates, since we can't
// adjust stats for point keys masked by range keys, but when we disable this
// masking we expect accurate stats and can assert this in various tests
// (notably kvnemesis).
DisableCheckSSTRangeKeyMasking = metamorphic.ConstantWithTestBool(
"disable-checksstconflicts-range-key-masking", false)
)
// NewSSTIterator returns an MVCCIterator for the provided "levels" of
// SST files. The SSTs are merged during iteration. Each subslice's sstables
// must have non-overlapping point keys, and be ordered by point key in
// ascending order. Range keys may overlap arbitrarily, including within a
// subarray. The outer slice of levels must be sorted in reverse chronological
// order: a key in a file in a level at a lower index will shadow the same key
// contained within a file in a level at a higher index.
func NewSSTIterator(files [][]sstable.ReadableFile, opts IterOptions) (MVCCIterator, error) {
return newPebbleSSTIterator(files, opts)
}
// NewSSTEngineIterator is like NewSSTIterator, but returns an EngineIterator.
func NewSSTEngineIterator(
files [][]sstable.ReadableFile, opts IterOptions,
) (EngineIterator, error) {
return newPebbleSSTIterator(files, opts)
}
// NewMemSSTIterator returns an MVCCIterator for the provided SST data,
// similarly to NewSSTIterator().
func NewMemSSTIterator(sst []byte, verify bool, opts IterOptions) (MVCCIterator, error) {
return NewMultiMemSSTIterator([][]byte{sst}, verify, opts)
}
// NewMultiMemSSTIterator returns an MVCCIterator for the provided SST data,
// similarly to NewSSTIterator().
func NewMultiMemSSTIterator(ssts [][]byte, verify bool, opts IterOptions) (MVCCIterator, error) {
files := make([]sstable.ReadableFile, 0, len(ssts))
for _, sst := range ssts {
files = append(files, vfs.NewMemFile(sst))
}
iter, err := NewSSTIterator([][]sstable.ReadableFile{files}, opts)
if err != nil {
return nil, err
}
if verify {
iter = newVerifyingMVCCIterator(iter.(*pebbleIterator))
}
return iter, nil
}
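// For illustration only, scanning the point keys of an in-memory SST could be
// written as follows (sstBytes is hypothetical):
//
//   iter, err := NewMemSSTIterator(sstBytes, true /* verify */, IterOptions{
//     KeyTypes:   IterKeyTypePointsOnly,
//     LowerBound: keys.MinKey,
//     UpperBound: keys.MaxKey,
//   })
//   if err != nil {
//     return err
//   }
//   defer iter.Close()
//   for iter.SeekGE(MVCCKey{Key: keys.MinKey}); ; iter.Next() {
//     if ok, err := iter.Valid(); err != nil {
//       return err
//     } else if !ok {
//       break
//     }
//     // Use iter.UnsafeKey() and iter.UnsafeValue() here.
//   }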
// CheckSSTConflicts iterates over an SST and a Reader in lockstep and errors
// out if it finds any conflicts. This includes intents and existing keys with a
// timestamp at or above the SST key timestamp.
//
// If disallowShadowingBelow is non-empty, it also errors for any existing live
// key at the SST key timestamp, but allows shadowing an existing key if its
// timestamp is above the given timestamp and the values are equal. See comment
// on AddSSTableRequest.DisallowShadowingBelow for details.
//
// sstTimestamp, if non-zero, represents the timestamp that all keys in the SST
// are expected to be at. This method can make performance optimizations with
// the expectation that no SST keys will be at any other timestamp. If the
// engine contains MVCC range keys in the ingested span then this will cause
// MVCC stats to be estimates since we can't adjust stats for masked points.
//
// The given SST and reader cannot contain intents, replicated locks, or inline
// values (i.e. zero timestamps). This is checked across the entire key span,
// from start to end.
//
// The returned MVCC statistics are a delta between the SST-only statistics and
// their effect when applied, which when added to the SST statistics will adjust
// them for existing keys and values.
func CheckSSTConflicts(
ctx context.Context,
sst []byte,
reader Reader,
start, end MVCCKey,
leftPeekBound, rightPeekBound roachpb.Key,
disallowShadowingBelow hlc.Timestamp,
sstTimestamp hlc.Timestamp,
maxLockConflicts, targetLockConflictBytes int64,
usePrefixSeek bool,
) (enginepb.MVCCStats, error) {
allowIdempotentHelper := func(_ hlc.Timestamp) bool { return false }
if !disallowShadowingBelow.IsEmpty() {
allowIdempotentHelper = func(extTimestamp hlc.Timestamp) bool {
return disallowShadowingBelow.LessEq(extTimestamp)
}
}
if leftPeekBound == nil {
leftPeekBound = keys.MinKey
}
if rightPeekBound == nil {
rightPeekBound = keys.MaxKey
}
if DisableCheckSSTRangeKeyMasking {
sstTimestamp = hlc.Timestamp{}
}
// In some iterations below, we try to call Next() instead of SeekGE() for a
// few iterations, as nexts are more performant. If `numNextsBeforeSeek` nexts
// are not sufficient to land at or after a desired SeekGE key, we fall back to
// a seek.
const numNextsBeforeSeek = 5
var statsDiff enginepb.MVCCStats
if usePrefixSeek {
// If we're going to be using a prefix iterator, check for the fast path
// first, where there are no keys in the reader between the sstable's start
// and end keys. We use a non-prefix iterator for this search, and reopen a
// prefix one if there are engine keys in the span.
nonPrefixIter, err := reader.NewMVCCIterator(ctx, MVCCKeyAndIntentsIterKind, IterOptions{
KeyTypes: IterKeyTypePointsAndRanges,
UpperBound: end.Key,
ReadCategory: fs.BatchEvalReadCategory,
})
if err != nil {
return statsDiff, err
}
nonPrefixIter.SeekGE(start)
valid, err := nonPrefixIter.Valid()
nonPrefixIter.Close()
if !valid {
return statsDiff, err
}
}
// Check for any overlapping locks, and return them to be resolved.
if locks, err := ScanLocks(
ctx, reader, start.Key, end.Key, maxLockConflicts, targetLockConflictBytes); err != nil {
return enginepb.MVCCStats{}, err
} else if len(locks) > 0 {
return enginepb.MVCCStats{}, &kvpb.LockConflictError{Locks: locks}
}
// Check for any range keys.
//
// TODO(bilal): Expose reader.Properties.NumRangeKeys() here, so we don't
// need to read the SST to figure out if it has range keys.
rkIter, err := NewMemSSTIterator(sst, false /* verify */, IterOptions{
KeyTypes: IterKeyTypeRangesOnly,
LowerBound: keys.MinKey,
UpperBound: keys.MaxKey,
})
if err != nil {
// NB: NewMemSSTIterator returns a nil iterator on error, so there is
// nothing to Close here.
return enginepb.MVCCStats{}, err
}
rkIter.SeekGE(NilKey)
if ok, err := rkIter.Valid(); err != nil {
rkIter.Close()
return enginepb.MVCCStats{}, err
} else if ok {
// If the incoming SST contains range tombstones, we cannot use prefix
// iteration.
usePrefixSeek = false
}
rkIter.Close()
rkIter, err = reader.NewMVCCIterator(ctx, MVCCKeyIterKind, IterOptions{
UpperBound: rightPeekBound,
KeyTypes: IterKeyTypeRangesOnly,
ReadCategory: fs.BatchEvalReadCategory,
})
if err != nil {
return enginepb.MVCCStats{}, err
}
rkIter.SeekGE(start)
var engineHasRangeKeys bool
if ok, err := rkIter.Valid(); err != nil {
rkIter.Close()
return enginepb.MVCCStats{}, err
} else if ok {
// If the engine contains range tombstones in this span, we cannot use prefix
// iteration.
usePrefixSeek = false
engineHasRangeKeys = true
}
rkIter.Close()
if usePrefixSeek {
// Prefix iteration and range key masking don't work together. See the
// comment on the panic inside pebbleIterator.setOptions.
sstTimestamp = hlc.Timestamp{}
}
if engineHasRangeKeys && sstTimestamp.IsSet() {
// If range key masking is requested and the engine contains range keys
// then stats will be estimates. Range key masking is performant, but it
// skips instances where we need to adjust GCBytesAge in the returned stats
// diff. Consider an example where a point key is masked by a range
// tombstone, and we added a new revision of that key above the range
// tombstone in the SST. The GCBytesAge contribution of that range tombstone
// on the point key's key (as opposed to the version contribution) needs to
// be un-done as that key is now being used by the live key.
//
// TODO(bilal): Close this gap in GCBytesAge calculation, see:
// https://github.com/cockroachdb/cockroach/issues/92254
statsDiff.ContainsEstimates += 2
}
extIter, err := reader.NewMVCCIterator(ctx, MVCCKeyIterKind, IterOptions{
KeyTypes: IterKeyTypePointsAndRanges,
LowerBound: leftPeekBound,
UpperBound: rightPeekBound,
RangeKeyMaskingBelow: sstTimestamp,
Prefix: usePrefixSeek,
useL6Filters: true,
ReadCategory: fs.BatchEvalReadCategory,
})
if err != nil {
return enginepb.MVCCStats{}, err
}
defer extIter.Close()
sstIter, err := NewMemSSTIterator(sst, false, IterOptions{
KeyTypes: IterKeyTypePointsAndRanges,
UpperBound: end.Key,
})
if err != nil {
return enginepb.MVCCStats{}, err
}
defer sstIter.Close()
// compareForCollision returns an error if the sstKey collides with extKey.
// It also adjusts statsDiff to account for the conflict if there's no error.
// If there's an sst range key that covers extKey, the first version of it
// above extKey must be passed into sstRangeKeyVersion, so that the deletion
// is recorded at the correct timestamp in stats (i.e. for GCBytesAge). A
// zero value for sstRangeKeyVersion is acceptable.
compareForCollision := func(sstKey, extKey MVCCKey, sstValueRaw, extValueRaw []byte, sstRangeKeyVersion MVCCRangeKeyVersion) error {
// Make sure both keys are proper committed MVCC keys. Note that this is
// only checked when the key exists both in the SST and existing data, it is
// not an exhaustive check of the SST.
if !sstKey.IsValue() {
return errors.New("SST keys must have timestamps")
}
sstValueIsTombstone, err := EncodedMVCCValueIsTombstone(sstValueRaw)
if err != nil {
return err
}
if !extKey.IsValue() {
var mvccMeta enginepb.MVCCMetadata
if err = extIter.ValueProto(&mvccMeta); err != nil {
return err
}
if len(mvccMeta.RawBytes) > 0 {
return errors.AssertionFailedf("inline values are unsupported")
} else if mvccMeta.Txn == nil {
return errors.AssertionFailedf("found intent without transaction")
} else {
return errors.AssertionFailedf("found intent after ScanLocks call")
}
}
extValueIsTombstone, err := EncodedMVCCValueIsTombstone(extValueRaw)
if err != nil {
return err
}
// Allow certain idempotent writes where key/timestamp/value all match:
//
// * disallowShadowingBelow: any matching key at or above the given timestamp.
allowIdempotent := !disallowShadowingBelow.IsEmpty() && disallowShadowingBelow.LessEq(extKey.Timestamp)
if allowIdempotent && sstKey.Timestamp.Equal(extKey.Timestamp) &&
bytes.Equal(extValueRaw, sstValueRaw) {
// This SST entry will effectively be a noop, but its stats have already
// been accounted for resulting in double-counting. To address this we
// send back a stats diff for these existing KVs so that we can subtract
// them later. This enables us to construct accurate MVCCStats and
// prevents expensive recomputation in the future.
metaKeySize := int64(len(sstKey.Key) + 1)
metaValSize := int64(0)
totalBytes := metaKeySize + metaValSize
// Cancel the GCBytesAge contribution of the point tombstone (if any)
// that exists in the SST stats.
statsDiff.AgeTo(extKey.Timestamp.WallTime)
// Update the skipped stats to account for the skipped meta key.
if !sstValueIsTombstone {
statsDiff.LiveBytes -= totalBytes
statsDiff.LiveCount--
}
statsDiff.KeyBytes -= metaKeySize
statsDiff.ValBytes -= metaValSize
statsDiff.KeyCount--
// Update the stats to account for the skipped versioned key/value.
totalBytes = int64(len(sstValueRaw)) + MVCCVersionTimestampSize
if !sstValueIsTombstone {
statsDiff.LiveBytes -= totalBytes
}
statsDiff.KeyBytes -= MVCCVersionTimestampSize
statsDiff.ValBytes -= int64(len(sstValueRaw))
statsDiff.ValCount--
return nil
}
// If requested, check that we're not shadowing a live key. Note that
// we check this before we check the timestamp, and avoid returning
// a WriteTooOldError -- that error implies that the client should
// retry at a higher timestamp, but we already know that such a retry
// would fail (because it will shadow an existing key).
if !extValueIsTombstone && !disallowShadowingBelow.IsEmpty() {
allowShadow := disallowShadowingBelow.LessEq(extKey.Timestamp) && bytes.Equal(extValueRaw, sstValueRaw)
if !allowShadow {
return kvpb.NewKeyCollisionError(sstKey.Key, sstValueRaw)
}
}
// If the existing key has a timestamp at or above the SST key, return a
// WriteTooOldError. Normally this could cause a transactional request to be
// automatically retried after a read refresh, which we would only want to
// do if AddSSTable had SSTTimestampToRequestTimestamp set, but AddSSTable
// cannot be used in transactions so we don't need to check.
if sstKey.Timestamp.LessEq(extKey.Timestamp) {
return kvpb.NewWriteTooOldError(
sstKey.Timestamp, extKey.Timestamp.Next(), sstKey.Key)
}
// If we are shadowing an existing key, we must update the stats accordingly
// to take into account the existing KV pair. The key is considered deleted
// at the lowest timestamp where there was an mvcc point tombstone, or an
// overlapping range tombstone or new point key.
if extValueIsTombstone {
statsDiff.AgeTo(extKey.Timestamp.WallTime)
} else if sstRangeKeyVersion.Timestamp.Compare(extKey.Timestamp) >= 0 && sstRangeKeyVersion.Timestamp.Compare(sstKey.Timestamp) < 0 {
statsDiff.AgeTo(sstRangeKeyVersion.Timestamp.WallTime)
} else {
statsDiff.AgeTo(sstKey.Timestamp.WallTime)
}
statsDiff.KeyCount--
statsDiff.KeyBytes -= int64(len(extKey.Key) + 1)
if !extValueIsTombstone {
statsDiff.LiveCount--
statsDiff.LiveBytes -= int64(len(extKey.Key) + 1)
statsDiff.LiveBytes -= int64(len(extValueRaw)) + MVCCVersionTimestampSize
}
return nil
}
sstIter.SeekGE(start)
sstOK, sstErr := sstIter.Valid()
var extOK bool
var extErr error
var sstPrevRangeKeys, extPrevRangeKeys MVCCRangeKeyStack
var sstFirstRangeKey MVCCRangeKeyStack
var extPrevKey, extPrevDeletedKey MVCCKey
if usePrefixSeek {
// In the case of prefix seeks, do not look at engine iter exhaustion. This
// is because the engine prefix iterator could be exhausted when it has
// iterated past its prefix, even if there are other keys after the prefix
// that should be checked.
for sstErr == nil && sstOK {
if err := ctx.Err(); err != nil {
return enginepb.MVCCStats{}, err
}
// extIter is a prefix iterator; it is expected to skip keys that belong
// to different prefixes. Only iterate along the sst iterator, and re-seek
// extIter each time.
extIter.SeekGE(MVCCKey{Key: sstIter.UnsafeKey().Key})
extOK, extErr = extIter.Valid()
if extErr != nil {
break
}
if !extOK {
// There is no key in extIter matching this prefix. Check the next key in
// sstIter. Note that we can't just use an exhausted extIter as a sign that
// we are done with the loop; extIter is a prefix iterator and could
// have keys after the current prefix that it will not return unless
// re-seeked.
sstIter.NextKey()
sstOK, sstErr = sstIter.Valid()
continue
}
// TODO(sumeer): extValueRaw is not always needed below. In many cases
// MVCCValueLenAndIsTombstone() suffices. This will require some
// rearrangement of the logic in compareForCollision. This is not a
// pressing optimization since currently the value is cheap to retrieve
// for the latest version of a key, and we are seeing the latest version
// because of the extIter.SeekGE call above.
extValueRaw, err := extIter.UnsafeValue()
if err != nil {
return enginepb.MVCCStats{}, err
}
sstValueRaw, err := sstIter.UnsafeValue()
if err != nil {
return enginepb.MVCCStats{}, err
}
extKey := extIter.UnsafeKey()
sstKey := sstIter.UnsafeKey()
// We just seeked the engine iter. If it has a mismatching prefix, the
// iterator is not obeying its contract.
if !bytes.Equal(extKey.Key, sstKey.Key) {
return enginepb.MVCCStats{}, errors.Errorf("prefix iterator returned mismatching prefix: %s != %s", extKey.Key, sstKey.Key)
}
if err := compareForCollision(sstKey, extKey, sstValueRaw, extValueRaw, MVCCRangeKeyVersion{}); err != nil {
return enginepb.MVCCStats{}, err
}
sstIter.NextKey()
sstOK, sstErr = sstIter.Valid()
}
} else if sstOK {
extIter.SeekGE(MVCCKey{Key: sstIter.UnsafeKey().Key})
extOK, extErr = extIter.Valid()
}
for !usePrefixSeek && sstErr == nil && sstOK && extOK && extErr == nil {
if err := ctx.Err(); err != nil {
return enginepb.MVCCStats{}, err
}
extHasPoint, extHasRange := extIter.HasPointAndRange()
sstHasPoint, sstHasRange := sstIter.HasPointAndRange()
var extRangeKeys, sstRangeKeys MVCCRangeKeyStack
if sstHasRange {
sstRangeKeys = sstIter.RangeKeys()
if sstFirstRangeKey.IsEmpty() {
sstFirstRangeKey = sstRangeKeys.Clone()
}
}
if extHasRange {
extRangeKeys = extIter.RangeKeys()
}
sstRangeKeysChanged := sstHasRange && !sstPrevRangeKeys.Bounds.Equal(sstRangeKeys.Bounds)
extRangeKeysChanged := extHasRange && !extPrevRangeKeys.Bounds.Equal(extRangeKeys.Bounds)
extKeyChanged := !extPrevKey.Equal(extIter.UnsafeKey())
if extKeyChanged {
extIter.UnsafeKey().CloneInto(&extPrevKey)
}
// Case where SST and engine both have range keys at the current iterator
// points. The SST range keys must be newer than engine range keys.
if extHasRange && sstHasRange {
// Check if the oldest SST range key conflicts with the newest ext
// range key.
if (sstRangeKeysChanged || extRangeKeysChanged) && sstRangeKeys.Bounds.Overlaps(extRangeKeys.Bounds) {
sstTombstone := sstRangeKeys.Versions[len(sstRangeKeys.Versions)-1]
if sstTombstone.Timestamp.Less(extRangeKeys.Versions[0].Timestamp) {
// Conflict. We can't slide an MVCC range tombstone below an
// existing MVCC range tombstone in the engine.
return enginepb.MVCCStats{}, kvpb.NewWriteTooOldError(
sstTombstone.Timestamp, extRangeKeys.Versions[0].Timestamp.Next(), sstRangeKeys.Bounds.Key)
}
if !extRangeKeys.Versions[0].Timestamp.Less(sstTombstone.Timestamp) {
// Check for idempotent range key additions. The top
// len(sstRangeKeys.Versions) timestamps must match between the two range
// key stacks.
extTombstones := extRangeKeys.Versions.Clone()
extTombstones.Trim(sstTombstone.Timestamp, hlc.MaxTimestamp)
isIdempotent := extTombstones.Equal(sstRangeKeys.Versions)
if ok := allowIdempotentHelper(extRangeKeys.Versions[0].Timestamp); !ok || !isIdempotent {
// Idempotence is either not allowed or there's a conflict.
return enginepb.MVCCStats{}, kvpb.NewWriteTooOldError(
sstTombstone.Timestamp, extRangeKeys.Versions[0].Timestamp.Next(), sstRangeKeys.Bounds.Key)
}
}
}
}
// Case where the engine has a range key that might delete the current SST
// point.
if sstHasPoint && extHasRange {
sstKey := sstIter.UnsafeKey()
if extRangeKeys.Covers(sstKey) {
// A range tombstone in the engine deletes this SST key. Return
// a WriteTooOldError.
return enginepb.MVCCStats{}, kvpb.NewWriteTooOldError(
sstKey.Timestamp, extRangeKeys.Versions[0].Timestamp.Next(), sstKey.Key)
}
}
// Check that the oldest SST range key is not underneath the current ext
// point key. If requested (with disallowShadowing or
// disallowShadowingBelow), check that the newest SST range tombstone does
// not shadow a live key.
if sstHasRange && extHasPoint {
sstBottomTombstone := sstRangeKeys.Versions[len(sstRangeKeys.Versions)-1]
sstTopTombstone := sstRangeKeys.Versions[0]
extKey := extIter.UnsafeKey()
extValueLen, extValueIsTombstone := 0, false
if extKey.IsValue() {
extValueLen, extValueIsTombstone, err = extIter.MVCCValueLenAndIsTombstone()
if err != nil {
return enginepb.MVCCStats{}, err
}
} else {
return enginepb.MVCCStats{}, errors.AssertionFailedf("found intent after ScanLocks call")
}
if sstBottomTombstone.Timestamp.LessEq(extKey.Timestamp) {
// Conflict.
return enginepb.MVCCStats{}, kvpb.NewWriteTooOldError(
sstBottomTombstone.Timestamp, extKey.Timestamp.Next(), sstRangeKeys.Bounds.Key)
}
if sstRangeKeys.Covers(extKey) {
// Check if shadowing a live key is allowed. Deleting a live key counts
// as a shadow.
extValueDeleted := extHasRange && extRangeKeys.Covers(extKey)
if !extValueIsTombstone && !extValueDeleted && !disallowShadowingBelow.IsEmpty() {
// Note that we don't check for value equality here, unlike in the
// point key shadow case. This is because a range key and a point key
// by definition have different values.
return enginepb.MVCCStats{}, errors.Errorf(
"ingested range key collides with an existing one: %s", sstTopTombstone)
}
if !extValueDeleted {
sstRangeKeyVersion, ok := sstRangeKeys.FirstAtOrAbove(extKey.Timestamp)
if !ok {
return enginepb.MVCCStats{}, errors.AssertionFailedf("expected range tombstone above timestamp %v", extKey.Timestamp)
}
sstPointShadowsExtPoint := sstHasPoint && sstIter.UnsafeKey().Key.Equal(extKey.Key)
if (extKeyChanged || sstRangeKeysChanged) && !sstPointShadowsExtPoint && !extKey.Equal(extPrevDeletedKey) {
extKey.CloneInto(&extPrevDeletedKey)
statsDiff.Add(updateStatsOnRangeKeyCover(
sstRangeKeyVersion.Timestamp, extKey, extValueLen, extValueIsTombstone))
} else if extKey.Equal(extPrevDeletedKey) && sstPointShadowsExtPoint {
// This is either a conflict, shadow, or idempotent operation.
// Subtract the RangeKeyCover stats diff from the last iteration, as
// compareForCollision will account for the shadow.
statsDiff.Subtract(updateStatsOnRangeKeyCover(
sstRangeKeyVersion.Timestamp, extKey, extValueLen, extValueIsTombstone))
}
}
}
}
if sstRangeKeysChanged {
if extHasRange && extRangeKeys.Bounds.Overlaps(sstRangeKeys.Bounds) {
mergedIntoExisting := false
overlappingSection := sstRangeKeys.Bounds
switch sstRangeKeys.Bounds.Key.Compare(extRangeKeys.Bounds.Key) {
case -1:
// sstRangeKey starts earlier than extRangeKey. Add a fragment
overlappingSection.Key = extRangeKeys.Bounds.Key
statsDiff.AgeTo(sstRangeKeys.Versions.Newest().WallTime)
statsDiff.RangeKeyBytes += int64(EncodedMVCCKeyPrefixLength(extRangeKeys.Bounds.Key))
addedFragment := MVCCRangeKeyStack{
Bounds: roachpb.Span{Key: sstRangeKeys.Bounds.Key, EndKey: extRangeKeys.Bounds.Key},
Versions: sstRangeKeys.Versions,
}
if addedFragment.CanMergeRight(extRangeKeys) {
statsDiff.Add(updateStatsOnRangeKeyMerge(extRangeKeys.Bounds.Key, sstRangeKeys.Versions))
// Remove the contribution for the end key.
statsDiff.RangeKeyBytes -= int64(EncodedMVCCKeyPrefixLength(sstRangeKeys.Bounds.EndKey))
mergedIntoExisting = true
} else {
// Add the sst range key versions again, to account for the overlap
// with extRangeKeys.
updatedStack := extRangeKeys
updatedStack.Versions = extRangeKeys.Versions.Clone()
for i, v := range sstRangeKeys.Versions {
if i == 0 {
// We do this dance to make updatedStack.Versions.Newest() == v. This
// is necessary to keep GCBytesAge calculations correct, we don't
// want updateStatsOnRangeKeyPutVersion to "lift" the GCBytesAge
// contribution of extRangeKeys' bounds. We will do that later.
// We only want it to add the version.
oldVersions := updatedStack.Versions
updatedStack.Versions = append(MVCCRangeKeyVersions{v}, oldVersions...)
}
statsDiff.Add(updateStatsOnRangeKeyPutVersion(updatedStack, v))
}
}
case 0:
// Same start key. No need to encode the start key again.
statsDiff.AgeTo(sstRangeKeys.Versions.Newest().WallTime)
statsDiff.RangeKeyCount--
statsDiff.RangeKeyBytes -= int64(EncodedMVCCKeyPrefixLength(sstRangeKeys.Bounds.Key))
case 1:
// This SST start key fragments the ext range key. Unless the ext
// range key has already been fragmented at this point by sstPrevRangeKey.
if sstPrevRangeKeys.IsEmpty() || !sstPrevRangeKeys.Bounds.EndKey.Equal(sstRangeKeys.Bounds.Key) {
statsDiff.Add(UpdateStatsOnRangeKeySplit(sstRangeKeys.Bounds.Key, extRangeKeys.Versions))
}
// No need to re-encode the start key, as UpdateStatsOnRangeKeySplit has already
// done that for us.
statsDiff.AgeTo(sstRangeKeys.Versions.Newest().WallTime)
statsDiff.RangeKeyCount--
statsDiff.RangeKeyBytes -= int64(EncodedMVCCKeyPrefixLength(sstRangeKeys.Bounds.Key))
}
if extRangeKeys.Bounds.EndKey.Compare(sstRangeKeys.Bounds.EndKey) < 0 {
overlappingSection.EndKey = extRangeKeys.Bounds.EndKey
}
// Move up the GCBytesAge contribution of the overlapping section from
// extRangeKeys.Newest up to sstRangeKeys.Newest.
{
keyBytes := int64(EncodedMVCCKeyPrefixLength(overlappingSection.Key)) +
int64(EncodedMVCCKeyPrefixLength(overlappingSection.EndKey))
statsDiff.AgeTo(extRangeKeys.Newest().WallTime)
statsDiff.RangeKeyBytes -= keyBytes
statsDiff.AgeTo(sstRangeKeys.Newest().WallTime)
statsDiff.RangeKeyBytes += keyBytes
}
// Check if the overlapping part of sstRangeKeys and extRangeKeys has
// idempotent versions. We already know this isn't a conflict, as that
// check happened earlier.
if !mergedIntoExisting {
idempotentIdx := 0
for _, v := range sstRangeKeys.Versions {
if idempotentIdx >= len(extRangeKeys.Versions) || !v.Equal(extRangeKeys.Versions[idempotentIdx]) {
break
}
// Subtract stats for this version, as it already exists in the
// engine.
overlappingStack := MVCCRangeKeyStack{
Bounds: overlappingSection,
Versions: sstRangeKeys.Versions,
}
statsDiff.Subtract(updateStatsOnRangeKeyPutVersion(overlappingStack, v))
idempotentIdx++
}
switch extRangeKeys.Bounds.EndKey.Compare(sstRangeKeys.Bounds.EndKey) {
case +1:
statsDiff.Add(UpdateStatsOnRangeKeySplit(sstRangeKeys.Bounds.EndKey, extRangeKeys.Versions))
// Remove the contribution for the end key.
statsDiff.AgeTo(sstRangeKeys.Versions.Newest().WallTime)
statsDiff.RangeKeyBytes -= int64(EncodedMVCCKeyPrefixLength(sstRangeKeys.Bounds.EndKey))
case 0:
// Remove the contribution for the end key.
statsDiff.AgeTo(sstRangeKeys.Versions.Newest().WallTime)
statsDiff.RangeKeyBytes -= int64(EncodedMVCCKeyPrefixLength(sstRangeKeys.Bounds.EndKey))
case -1:
statsDiff.Add(UpdateStatsOnRangeKeySplit(extRangeKeys.Bounds.EndKey, sstRangeKeys.Versions))
// Remove the contribution for the end key.
statsDiff.AgeTo(sstRangeKeys.Versions.Newest().WallTime)
statsDiff.RangeKeyBytes -= int64(EncodedMVCCKeyPrefixLength(extRangeKeys.Bounds.EndKey))
}
}
}
if extHasRange && sstRangeKeys.CanMergeRight(extRangeKeys) {
statsDiff.Add(updateStatsOnRangeKeyMerge(sstRangeKeys.Bounds.EndKey, sstRangeKeys.Versions))
}
if !extPrevRangeKeys.IsEmpty() && extPrevRangeKeys.CanMergeRight(sstRangeKeys) {
statsDiff.Add(updateStatsOnRangeKeyMerge(sstRangeKeys.Bounds.Key, sstRangeKeys.Versions))
} else if !extHasRange || extRangeKeys.Bounds.Key.Compare(sstRangeKeys.Bounds.Key) >= 0 {
// Complication: we need to check if there's a range key to the left of
// this range key that we could merge with. The only foolproof way
// to do that is to copy the current iterator position in its entirety,
// call PeekRangeKeyLeft, and then SeekGE the engine iterator back
// to its original position.
savedExtKey := extIter.UnsafeKey().Clone()
pos, peekedExtRangeKeys, err := PeekRangeKeysLeft(extIter, sstRangeKeys.Bounds.Key)
if err != nil {
return enginepb.MVCCStats{}, err
}
if pos == 0 && peekedExtRangeKeys.CanMergeRight(sstRangeKeys) {
statsDiff.Add(updateStatsOnRangeKeyMerge(sstRangeKeys.Bounds.Key, sstRangeKeys.Versions))
}
extIter.SeekGE(savedExtKey)
// After seeking, the old buffers have been invalidated.
// Re-retrieve the buffers.
if extHasRange {
extRangeKeys = extIter.RangeKeys()
}
}
if extRangeKeysChanged && !sstPrevRangeKeys.IsEmpty() && sstPrevRangeKeys.Bounds.Overlaps(extRangeKeys.Bounds) {
// Because we always re-seek the extIter after every sstIter step,
// it is possible that we missed an overlap between extRangeKeys and
// sstPrevRangeKeys. Account for that here by adding the version stats
// for sstPrevRangeKeys.
updatedStack := extRangeKeys
updatedStack.Versions = extRangeKeys.Versions.Clone()
for i, v := range sstPrevRangeKeys.Versions {
statsDiff.Add(updateStatsOnRangeKeyPutVersion(updatedStack, v))
if i == 0 {
// We do this dance to make updatedStack.Versions.Newest() == v. This
// is necessary to keep GCBytesAge calculations correct.
oldVersions := updatedStack.Versions
updatedStack.Versions = append(MVCCRangeKeyVersions{v}, oldVersions...)
}
}
}
sstPrevRangeKeys = sstRangeKeys.Clone()
}
if extRangeKeysChanged {
// Note that we exclude sstRangeKeysChanged below, as this case only
// accounts for additional ext range keys that this SST range key stack
// could be adding versions to. The very first ext range key stack that
// this sst stack contributes stats to is already accounted by the
// sstRangeKeysChanged conditional above.
if sstHasRange && sstRangeKeys.Bounds.Overlaps(extRangeKeys.Bounds) && !sstRangeKeysChanged {
idempotentIdx := 0
updatedStack := extRangeKeys
if sstRangeKeys.Bounds.EndKey.Compare(extRangeKeys.Bounds.EndKey) < 0 {
updatedStack.Bounds.EndKey = sstRangeKeys.Bounds.EndKey
}
updatedStack.Versions = extRangeKeys.Versions.Clone()
for i, v := range sstRangeKeys.Versions {
if len(extRangeKeys.Versions) > idempotentIdx && v.Timestamp.Equal(extRangeKeys.Versions[idempotentIdx].Timestamp) {
// Skip this version, as it already exists in the engine.
idempotentIdx++
continue
}
statsDiff.Add(updateStatsOnRangeKeyPutVersion(updatedStack, v))
if i == idempotentIdx {
// We do this dance to make updatedStack.Versions.Newest() == v. This
// is necessary to keep GCBytesAge calculations correct.
oldVersions := updatedStack.Versions
updatedStack.Versions = append(MVCCRangeKeyVersions{v}, oldVersions...)
}
}
// Check if this ext range key is going to fragment the SST range key.
if sstRangeKeys.Bounds.Key.Compare(extRangeKeys.Bounds.Key) < 0 && !extRangeKeys.Versions.Equal(sstRangeKeys.Versions) &&
(extPrevRangeKeys.IsEmpty() || !extPrevRangeKeys.Bounds.EndKey.Equal(extRangeKeys.Bounds.Key)) {
// Add a fragment end key at extRangeKeys.Bounds.Key, to finish off
// the sst fragment at that point. Note that we've already "lifted up"
// the GCBytesAge of the overlapping parts of extRangeKeys and
// sstRangeKeys when we did the call to updateStatsOnRangeKeyPutVersion
// in the for loop above.
statsDiff.AgeTo(sstRangeKeys.Versions.Newest().WallTime)
statsDiff.RangeKeyBytes += int64(EncodedMVCCKeyPrefixLength(extRangeKeys.Bounds.Key))
updatedStack := extRangeKeys
updatedStack.Versions = extRangeKeys.Versions.Clone()
} else if !extPrevRangeKeys.IsEmpty() && extPrevRangeKeys.Bounds.EndKey.Equal(extRangeKeys.Bounds.Key) {
updatedStack := extRangeKeys
updatedStack.Versions = extRangeKeys.Versions.Clone()
// Remove the contribution for versions, as that's already been added.
for i, v := range sstRangeKeys.Versions {
if i == 0 {
// We do this dance to make updatedStack.Versions.Newest() == v. This
// is necessary to keep GCBytesAge calculations correct.
oldVersions := updatedStack.Versions
updatedStack.Versions = append(MVCCRangeKeyVersions{v}, oldVersions...)
}
statsDiff.Subtract(updateStatsOnRangeKeyPutVersion(updatedStack, v))
}
statsDiff.AgeTo(sstRangeKeys.Versions.Newest().WallTime)
statsDiff.RangeKeyBytes -= int64(EncodedMVCCKeyPrefixLength(extRangeKeys.Bounds.Key))
statsDiff.RangeKeyCount--
}
// Check if this ext range key is going to be fragmented by the sst
// range key's end key.
switch extRangeKeys.Bounds.EndKey.Compare(sstRangeKeys.Bounds.EndKey) {
case +1:
if !extRangeKeys.Versions.Equal(sstRangeKeys.Versions) {
// This SST range key will fragment this ext range key.
statsDiff.Add(UpdateStatsOnRangeKeySplit(sstRangeKeys.Bounds.EndKey, extRangeKeys.Versions))
}
// Remove the contribution for the end key.
statsDiff.AgeTo(sstRangeKeys.Versions.Newest().WallTime)
statsDiff.RangeKeyBytes -= int64(EncodedMVCCKeyPrefixLength(sstRangeKeys.Bounds.EndKey))
case 0:
// Remove the contribution for the end key.
statsDiff.AgeTo(sstRangeKeys.Versions.Newest().WallTime)
statsDiff.RangeKeyBytes -= int64(EncodedMVCCKeyPrefixLength(sstRangeKeys.Bounds.EndKey))
case -1:
if !extRangeKeys.Versions.Equal(sstRangeKeys.Versions) {
// This ext range key's end will fragment this sst range key.
statsDiff.Add(UpdateStatsOnRangeKeySplit(extRangeKeys.Bounds.EndKey, sstRangeKeys.Versions))
statsDiff.AgeTo(sstRangeKeys.Versions.Newest().WallTime)
statsDiff.RangeKeyBytes -= int64(EncodedMVCCKeyPrefixLength(extRangeKeys.Bounds.EndKey))
}
}
}
if !sstPrevRangeKeys.IsEmpty() && sstPrevRangeKeys.CanMergeRight(extRangeKeys) && !sstRangeKeysChanged {
// We require !sstRangeKeysChanged here to avoid double-counting this merge.
statsDiff.Add(updateStatsOnRangeKeyMerge(sstPrevRangeKeys.Bounds.EndKey, extRangeKeys.Versions))
}
extPrevRangeKeys = extRangeKeys.Clone()
}
extKey := extIter.UnsafeKey()
sstValueRaw, err := sstIter.UnsafeValue()
if err != nil {
return enginepb.MVCCStats{}, err
}
sstKey := sstIter.UnsafeKey()
// Keep seeking the iterators until both keys are equal.
if cmp := bytes.Compare(extKey.Key, sstKey.Key); cmp < 0 {
// sstIter is further ahead. This should never happen; we always seek
// extIter after seeking/nexting sstIter.
return enginepb.MVCCStats{}, errors.AssertionFailedf("expected engine iter to be ahead of sst iter")
} else if cmp > 0 && sstHasPoint && !extHasRange {
// We exclude !sstHasPoint above in case we were at a range key pause
// point that matches extKey. In that case, the below SeekGE would make
// no forward progress.
sstIter.SeekGE(MVCCKey{Key: extKey.Key})
sstOK, sstErr = sstIter.Valid()
if sstOK {
// Seeks on the engine are expensive. Try Next()ing if we're very close
// to the sst key (which we might be).
nextsUntilSeek := numNextsBeforeSeek
rangeKeyChanged := false
for extOK && extIter.UnsafeKey().Key.Compare(sstIter.UnsafeKey().Key) < 0 {
extIter.NextKey()
extOK, _ = extIter.Valid()
rangeKeyChanged = rangeKeyChanged || (extOK && extIter.RangeKeyChanged())
nextsUntilSeek--
if nextsUntilSeek <= 0 {
break
}
}
// Handle moving from a range key to an exhausted iterator.
rangeKeyChanged = rangeKeyChanged || (!extOK && !extPrevRangeKeys.IsEmpty())
// If we haven't reached the SST key yet, seek to it. Otherwise, if we
// stepped past it but the range key changed, we have to seek back to it,
// since we could otherwise have missed a range key that overlapped
// the SST key.
extCmp := 1
if extOK {
extCmp = extIter.UnsafeKey().Key.Compare(sstIter.UnsafeKey().Key)
}
if extCmp < 0 || (extCmp > 0 && rangeKeyChanged) {
extIter.SeekGE(MVCCKey{Key: sstIter.UnsafeKey().Key})
}
}
extOK, extErr = extIter.Valid()
continue
}
extValueDeletedByRange := extHasRange && extHasPoint && extRangeKeys.Covers(extKey)
if extKey.Key.Equal(sstKey.Key) {
if sstHasPoint && extHasPoint && !extValueDeletedByRange {
// TODO(sumeer): extValueRaw is not always needed below. In many cases
// MVCCValueLenAndIsTombstone() suffices. This will require some
// rearrangement of the logic in compareForCollision.
extValueRaw, err := extIter.UnsafeValue()
if err != nil {
return enginepb.MVCCStats{}, err
}
var sstRangeKeyVersion MVCCRangeKeyVersion
if sstHasRange && sstRangeKeys.Covers(extKey) {
sstRangeKeyVersion, _ = sstRangeKeys.FirstAtOrAbove(extKey.Timestamp)
}
if err := compareForCollision(sstKey, extKey, sstValueRaw, extValueRaw, sstRangeKeyVersion); err != nil {
return enginepb.MVCCStats{}, err
}
} else if sstHasPoint && extValueDeletedByRange {
// Don't double-count the current key.
var deletedAt hlc.Timestamp
if _, isTombstone, err := extIter.MVCCValueLenAndIsTombstone(); err != nil {
return enginepb.MVCCStats{}, err
} else if isTombstone {
deletedAt = extKey.Timestamp
} else {
version, _ := extRangeKeys.Versions.FirstAtOrAbove(extKey.Timestamp)
deletedAt = version.Timestamp
}
statsDiff.AgeTo(deletedAt.WallTime)
statsDiff.KeyCount--
statsDiff.KeyBytes -= int64(len(extKey.Key) + 1)
}
}
// Fast path with sstTimestamp set and a common case of import cancellation.
// Since we use range key masking, we can just Next() the ext iterator
// past its range key.
if sstTimestamp.IsSet() && extHasRange && !extHasPoint && !sstHasRange {
if extRangeKeys.Newest().Less(sstTimestamp) {
// All range key versions are below the request timestamp. We can seek
// past the range key, as all SST points/ranges are going to be above
// this range key.
extIter.Next()
extOK, extErr = extIter.Valid()
if !extOK {
break
}
sstIter.SeekGE(MVCCKey{Key: extIter.UnsafeKey().Key})
sstOK, sstErr = sstIter.Valid()
if sstOK {
// Seeks on the engine are expensive. Try Next()ing if we're very close
// to the sst key (which we might be).
nextsUntilSeek := numNextsBeforeSeek
rangeKeyChanged := false
for extOK && extIter.UnsafeKey().Key.Compare(sstIter.UnsafeKey().Key) < 0 {
extIter.NextKey()
extOK, _ = extIter.Valid()
rangeKeyChanged = rangeKeyChanged || (extOK && extIter.RangeKeyChanged())
nextsUntilSeek--
if nextsUntilSeek <= 0 {
break
}
}
// Handle moving from a range key to an exhausted iterator.
rangeKeyChanged = rangeKeyChanged || (!extOK && !extPrevRangeKeys.IsEmpty())
// If we haven't reached the SST key yet, seek to it. Otherwise, if we
// stepped past it but the range key changed, we have to seek back to it,
// since we could otherwise have missed a range key that overlapped
// the SST key.
extCmp := 1
if extOK {
extCmp = extIter.UnsafeKey().Key.Compare(sstIter.UnsafeKey().Key)
}
if extCmp < 0 || (extCmp > 0 && rangeKeyChanged) {
extIter.SeekGE(MVCCKey{Key: sstIter.UnsafeKey().Key})
}
}
extOK, extErr = extIter.Valid()
continue
}
}
steppedExtIter := false
// Before Next-ing the SST iter, if it contains any range keys, check if both:
// 1) the next SST key takes us outside the current SST range key
// 2) the next ext key overlaps with the current sst range key
// In that case, we want to step the ext iter forward and seek the sst
// iter back to it.
//
// This handles cases like this, where the b-d range key could get ignored:
// sst: a-----c e
// ext: a b-----d
if sstHasRange && sstRangeKeys.Bounds.ContainsKey(extKey.Key) {
// Check for condition 1.
//
// NB: sstPrevRangeKeys is already a clone of the current sstRangeKeys.
sstPrevKey := sstIter.UnsafeKey().Clone()
sstRangeKeys = sstPrevRangeKeys
if sstHasPoint {
sstIter.NextKey()
} else {
sstIter.Next()
}
sstOK, _ = sstIter.Valid()
if !sstOK || sstPrevRangeKeys.Bounds.ContainsKey(sstIter.UnsafeKey().Key) {
// Restore the sst iter and continue on. The below Next()ing logic is
// sufficient in this case.
sstIter.SeekGE(sstPrevKey)
sstOK, sstErr = sstIter.Valid()
} else {
extIter.UnsafeKey().CloneInto(&extPrevKey)
if extHasPoint {
extIter.NextKey()
} else {
extIter.Next()
}
extOK, extErr = extIter.Valid()
if extOK && sstPrevRangeKeys.Bounds.ContainsKey(extIter.UnsafeKey().Key) {
// Skip the Next()ing logic below so we can check for overlaps
// between this ext key and the same sst key. Note that we need
// to restore the sst iter back to the same range key pause point.
steppedExtIter = true
sstIter.SeekGE(MVCCKey{Key: extIter.UnsafeKey().Key})
sstOK, sstErr = sstIter.Valid()
} else {
// Special case: if extIter is at a range key that sstPrevRangeKeys
// merges into, *and* the next SST key is outside the bounds of this
// SST range key, then account for that merge. If we hadn't excluded
// the case where the current SST key is within its own range key
// bounds, we'd have double-counted the merge when we did the collision
// check.
if extOK && sstOK && !sstPrevRangeKeys.Bounds.ContainsKey(sstIter.UnsafeKey().Key) {
_, extHasRange = extIter.HasPointAndRange()
if extHasRange && sstPrevRangeKeys.CanMergeRight(extIter.RangeKeys()) {
statsDiff.Add(updateStatsOnRangeKeyMerge(sstPrevRangeKeys.Bounds.EndKey, sstPrevRangeKeys.Versions))
}
}
// Fall back to the below Next()ing logic.
sstIter.SeekGE(sstPrevKey)
sstOK, sstErr = sstIter.Valid()
extIter.SeekGE(extPrevKey)
extOK, extErr = extIter.Valid()
// We could have reset extHasRange above, so set it back.
_, extHasRange = extIter.HasPointAndRange()
}
}
}
// Calling NextKey is only safe if both iterators are at a point key. This is
// because there could be a point key hiding behind the range key that we're
// currently at, and NextKey() would skip over it.
//
// The below logic accounts for all combinations of point keys and range
// keys being present and not present at the current iterator positions.
// Note that SeekGE()s pause at the seek key if there's a covering range key;
// however, we need to take care not to go into an infinite loop of seeks,
// where we step one iterator past a transient range key pausing point and
// seek the other, and then on the next iteration step the second iterator
// and seek the former iterator back to the same point.
if sstHasPoint && extHasPoint && !steppedExtIter {
maybeReseekExtIter := false
if sstHasRange && extHasRange {
// Step both iterators. Seek whichever one lands further ahead.
extIter.NextKey()
extOK, extErr = extIter.Valid()
if extErr != nil {
return enginepb.MVCCStats{}, extErr
}
sstIter.NextKey()
sstOK, sstErr = sstIter.Valid()
if sstOK && (!extOK || extIter.UnsafeKey().Key.Compare(sstIter.UnsafeKey().Key) > 0) {
extIter.SeekGE(MVCCKey{Key: sstIter.UnsafeKey().Key})
extOK, extErr = extIter.Valid()
} else if extOK && (!sstOK || sstIter.UnsafeKey().Key.Compare(extIter.UnsafeKey().Key) > 0) {
// sst iter > ext iter. Seek sst iter back. Then re-seek extIter
// if sst iter is still ahead of ext iter.
sstIter.SeekGE(MVCCKey{Key: extIter.UnsafeKey().Key})
sstOK, sstErr = sstIter.Valid()
maybeReseekExtIter = true
}
} else if sstHasRange {
// Step the ext iter instead of the sst iter. This prevents us from
// missing any ext keys that could overlap with this sst range key.
// The downside of doing this is that we have to reseek both iterators
// right after, to preserve the sst iterator < ext iterator invariant.
extIter.NextKey()
extOK, extErr = extIter.Valid()
if extOK {
sstIter.SeekGE(MVCCKey{Key: extIter.UnsafeKey().Key})
sstOK, sstErr = sstIter.Valid()
maybeReseekExtIter = true
}
} else {
sstIter.NextKey()
sstOK, sstErr = sstIter.Valid()
maybeReseekExtIter = true
}
if extErr != nil {
return enginepb.MVCCStats{}, extErr
}
if maybeReseekExtIter && sstOK && (!extOK || extIter.UnsafeKey().Key.Compare(sstIter.UnsafeKey().Key) < 0) {
extIter.SeekGE(MVCCKey{Key: sstIter.UnsafeKey().Key})
extOK, extErr = extIter.Valid()
}
} else if !steppedExtIter {
oldKey := sstIter.UnsafeKey().Clone()
oldExtKey := extIter.UnsafeKey().Clone()
if sstHasPoint { // !extHasPoint
// Check if ext has a point at this key. If not, NextKey() on sstIter
// and seek extIter.
extIter.Next()
steppedExtIter = true
extOK, extErr = extIter.Valid()
if extErr != nil {
return enginepb.MVCCStats{}, extErr
}
if !extOK || !extIter.UnsafeKey().Key.Equal(oldExtKey.Key) {
// extIter either went out of bounds or stepped one key ahead. If the
// ext iter is at a new key that's less than the next sst key, re-seek
// the sst iter. If not, re-seek the ext iter at the next sst key.
sstIter.NextKey()
sstOK, sstErr = sstIter.Valid()
if sstOK && extOK && extIter.UnsafeKey().Key.Compare(sstIter.UnsafeKey().Key) < 0 {
sstIter.SeekGE(MVCCKey{Key: extIter.UnsafeKey().Key})
sstOK, sstErr = sstIter.Valid()
if sstOK && extIter.UnsafeKey().Key.Compare(sstIter.UnsafeKey().Key) < 0 {
extIter.SeekGE(MVCCKey{Key: sstIter.UnsafeKey().Key})
extOK, extErr = extIter.Valid()
}
} else if sstOK {
extIter.SeekGE(MVCCKey{Key: sstIter.UnsafeKey().Key})
extOK, extErr = extIter.Valid()
}
}
// If extIter found a point key at the same MVCC Key, we still need
// to check for conflicts against it.
} else if extHasPoint { // !sstHasPoint
// Similar logic as above, but with the iterators swapped. The one key
// difference is what we do when the sstIter changes keys.
sstIter.Next()
sstOK, sstErr = sstIter.Valid()
if sstErr != nil {
return enginepb.MVCCStats{}, sstErr
}
if sstOK && !sstIter.UnsafeKey().Key.Equal(oldKey.Key) && extIter.UnsafeKey().Key.Compare(sstIter.UnsafeKey().Key) < 0 {
// sstIter stepped one key ahead. Re-seek both iterators at the next
// ext key.
extIter.NextKey()
extOK, extErr = extIter.Valid()
if extOK && extIter.UnsafeKey().Key.Compare(sstIter.UnsafeKey().Key) < 0 {
sstIter.SeekGE(MVCCKey{Key: extIter.UnsafeKey().Key})
sstOK, sstErr = sstIter.Valid()
}
if sstOK {
extIter.SeekGE(MVCCKey{Key: sstIter.UnsafeKey().Key})
extOK, extErr = extIter.Valid()
}
}
// If sstIter found a point key at the same MVCC Key, we still need
// to check for conflicts against it.
} else { // sstHasRange && extHasRange && !sstHasPoint && !extHasPoint
// Step both iterators forward. If one iterator stays at the same key,
// seek the other one back to the same key.
//
// Note that we can't do this if either sstHasPoint or extHasPoint, as
// this logic does not guarantee forward progress in those cases.
sstIter.Next()
sstOK, sstErr = sstIter.Valid()
sstChangedKeys := !sstOK || !sstIter.UnsafeKey().Key.Equal(oldKey.Key)
extIter.Next()
steppedExtIter = true
extOK, extErr = extIter.Valid()
extChangedKeys := !extOK || !extIter.UnsafeKey().Key.Equal(oldExtKey.Key)
if sstOK && extOK && sstChangedKeys && extChangedKeys &&
extIter.UnsafeKey().Key.Compare(sstIter.UnsafeKey().Key) < 0 {
sstIter.SeekGE(MVCCKey{Key: extIter.UnsafeKey().Key})
sstOK, sstErr = sstIter.Valid()
if sstOK && extIter.UnsafeKey().Key.Compare(sstIter.UnsafeKey().Key) < 0 {
extIter.SeekGE(MVCCKey{Key: sstIter.UnsafeKey().Key})
extOK, extErr = extIter.Valid()
}
} else if sstOK && sstChangedKeys && !extOK {
extIter.SeekGE(MVCCKey{Key: sstIter.UnsafeKey().Key})
extOK, extErr = extIter.Valid()
} else {
if sstChangedKeys && !extChangedKeys {
sstIter.SeekGE(MVCCKey{Key: extIter.UnsafeKey().Key})
sstOK, sstErr = sstIter.Valid()
if sstOK && extIter.UnsafeKey().Key.Compare(sstIter.UnsafeKey().Key) < 0 {
extIter.SeekGE(MVCCKey{Key: sstIter.UnsafeKey().Key})
extOK, extErr = extIter.Valid()
}
}
// Re-seek the ext iterator if the ext iterator changed keys and:
// 1) the SST iterator did not change keys, and we need to bring the ext
// iterator back.
// 2) the ext iterator became invalid
// 3) both iterators changed keys.
if sstOK && extChangedKeys {
extIter.SeekGE(MVCCKey{Key: sstIter.UnsafeKey().Key})
extOK, extErr = extIter.Valid()
}
}
// If both iterators are invalid, we are now done. If both iterators
// are at point keys under the same MVCC key, then we can check for
// conflicts between them.
}
}
if !sstOK && extOK && !sstPrevRangeKeys.IsEmpty() {
// If the SST iter previously had a range key, it's possible that the
// ext iter has future range keys that we have yet to process. Check
// if that's the case.
if !steppedExtIter {
extIter.NextKey()
}
extOK, extErr = extIter.Valid()
if extOK {
sstIter.SeekGE(MVCCKey{Key: extIter.UnsafeKey().Key})
sstOK, sstErr = sstIter.Valid()
if sstOK {
// This SeekGE is purely to maintain the extIter > sstIter invariant;
// in most cases it'll be a no-op.
extIter.SeekGE(MVCCKey{Key: sstIter.UnsafeKey().Key})
extOK, extErr = extIter.Valid()
}
}
}
// Handle case where the ext iter has a range key that we could possibly
// merge into, but the sst iter has been exhausted.
if !sstOK && extOK && !sstPrevRangeKeys.IsEmpty() {
_, extHasRange = extIter.HasPointAndRange()
if extHasRange && sstPrevRangeKeys.CanMergeRight(extIter.RangeKeys()) {
statsDiff.Add(updateStatsOnRangeKeyMerge(sstPrevRangeKeys.Bounds.EndKey, sstPrevRangeKeys.Versions))
}
}
}
// Handle the case where there's an ext range key behind the last sst range
// key that was not processed in the loop itself (i.e. sstPrevRangeKeys !=
// sstIter.RangeKeys()).
if sstOK && !extOK {
_, sstHasRange := sstIter.HasPointAndRange()
if sstHasRange {
sstRangeKeys := sstIter.RangeKeys()
if !sstRangeKeys.Bounds.Equal(sstPrevRangeKeys.Bounds) {
pos, peekedExtRangeKeys, err := PeekRangeKeysLeft(extIter, sstRangeKeys.Bounds.Key)
if err != nil {
return enginepb.MVCCStats{}, err
}
if pos == 0 && peekedExtRangeKeys.CanMergeRight(sstRangeKeys) {
statsDiff.Add(updateStatsOnRangeKeyMerge(sstRangeKeys.Bounds.Key, sstRangeKeys.Versions))
}
}
}
}
if extErr != nil {
return enginepb.MVCCStats{}, extErr
}
if sstErr != nil {
return enginepb.MVCCStats{}, sstErr
}
return statsDiff, nil
}
// UpdateSSTTimestamps replaces all MVCC timestamps in the provided SST with
// the given "to" timestamp. All keys must already have the given "from"
// timestamp.
func UpdateSSTTimestamps(
ctx context.Context,
st *cluster.Settings,
sst []byte,
from, to hlc.Timestamp,
concurrency int,
stats *enginepb.MVCCStats,
) ([]byte, enginepb.MVCCStats, error) {
if from.IsEmpty() {
return nil, enginepb.MVCCStats{}, errors.Errorf("from timestamp not given")
}
if to.IsEmpty() {
return nil, enginepb.MVCCStats{}, errors.Errorf("to timestamp not given")
}
sstOut := &MemObject{}
sstOut.Buffer.Grow(len(sst))
var statsDelta enginepb.MVCCStats
if stats != nil {
// There could be a GCBytesAge delta between the old and new timestamps.
// Calculate this delta by subtracting all the relevant stats at the
// old timestamp, and then aging the stats to the new timestamp before
// zeroing the stats again.
// TODO(nvanbenschoten): should this just be using MVCCStats.Add and
// MVCCStats.Subtract?
statsDelta.AgeTo(from.WallTime)
statsDelta.KeyBytes -= stats.KeyBytes
statsDelta.ValBytes -= stats.ValBytes
statsDelta.RangeKeyBytes -= stats.RangeKeyBytes
statsDelta.RangeValBytes -= stats.RangeValBytes
statsDelta.LiveBytes -= stats.LiveBytes
statsDelta.IntentBytes -= stats.IntentBytes
statsDelta.IntentCount -= stats.IntentCount
statsDelta.LockBytes -= stats.LockBytes
statsDelta.LockCount -= stats.LockCount
statsDelta.AgeTo(to.WallTime)
statsDelta.KeyBytes += stats.KeyBytes
statsDelta.ValBytes += stats.ValBytes
statsDelta.RangeKeyBytes += stats.RangeKeyBytes
statsDelta.RangeValBytes += stats.RangeValBytes
statsDelta.LiveBytes += stats.LiveBytes
statsDelta.IntentBytes += stats.IntentBytes
statsDelta.IntentCount += stats.IntentCount
statsDelta.LockBytes += stats.LockBytes
statsDelta.LockCount += stats.LockCount
}
// Fancy optimized Pebble SST rewriter.
if concurrency > 0 {
defaults := DefaultPebbleOptions()
opts := defaults.MakeReaderOptions()
if fp := defaults.Levels[0].FilterPolicy; fp != nil && len(opts.Filters) == 0 {
opts.Filters = map[string]sstable.FilterPolicy{fp.Name(): fp}
}
rewriteOpts, minTableFormat := makeSSTRewriteOptions(ctx, st)
_, tableFormat, err := sstable.RewriteKeySuffixesAndReturnFormat(sst,
opts,
sstOut,
rewriteOpts,
EncodeMVCCTimestampSuffix(from),
EncodeMVCCTimestampSuffix(to),
concurrency,
)
if err != nil {
return nil, enginepb.MVCCStats{}, err
}
if minTableFormat > tableFormat {
return nil, enginepb.MVCCStats{},
errors.Errorf("rewrite table format %s is less than min format %s",
redact.SafeString(tableFormat.String()), redact.SafeString(minTableFormat.String()))
}
return sstOut.Bytes(), statsDelta, nil
}
// Naïve read/write loop.
writer := MakeIngestionSSTWriter(ctx, st, sstOut)
defer writer.Close()
// Rewrite point keys.
iter, err := NewMemSSTIterator(sst, false /* verify */, IterOptions{
KeyTypes: IterKeyTypePointsOnly,
LowerBound: keys.MinKey,
UpperBound: keys.MaxKey,
})
if err != nil {
return nil, enginepb.MVCCStats{}, err
}
defer iter.Close()
for iter.SeekGE(MVCCKey{Key: keys.MinKey}); ; iter.Next() {
if ok, err := iter.Valid(); err != nil {
return nil, enginepb.MVCCStats{}, err
} else if !ok {
break
}
key := iter.UnsafeKey()
if key.Timestamp != from {
return nil, enginepb.MVCCStats{}, errors.Errorf("unexpected timestamp %s (expected %s) for key %s",
key.Timestamp, from, key.Key)
}
v, err := iter.UnsafeValue()
if err != nil {
return nil, enginepb.MVCCStats{}, err
}
err = writer.PutRawMVCC(MVCCKey{Key: key.Key, Timestamp: to}, v)
if err != nil {
return nil, enginepb.MVCCStats{}, err
}
}
// Rewrite range keys.
iter, err = NewMemSSTIterator(sst, false /* verify */, IterOptions{
KeyTypes: IterKeyTypeRangesOnly,
LowerBound: keys.MinKey,
UpperBound: keys.MaxKey,
})
if err != nil {
return nil, enginepb.MVCCStats{}, err
}
defer iter.Close()
for iter.SeekGE(MVCCKey{Key: keys.MinKey}); ; iter.Next() {
if ok, err := iter.Valid(); err != nil {
return nil, enginepb.MVCCStats{}, err
} else if !ok {
break
}
rangeKeys := iter.RangeKeys()
for _, v := range rangeKeys.Versions {
if v.Timestamp != from {
return nil, enginepb.MVCCStats{}, errors.Errorf("unexpected timestamp %s (expected %s) for range key %s",
v.Timestamp, from, rangeKeys.Bounds)
}
v.Timestamp = to
if err = writer.PutRawMVCCRangeKey(rangeKeys.AsRangeKey(v), v.Value); err != nil {
return nil, enginepb.MVCCStats{}, err
}
}
}
if err = writer.Finish(); err != nil {
return nil, enginepb.MVCCStats{}, err
}
return sstOut.Bytes(), statsDelta, nil
}
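// exampleRewriteSSTTimestamps is a hypothetical sketch (not part of the
// original source) of how a caller might use UpdateSSTTimestamps to rewrite an
// sstable written at a provisional timestamp so it can be ingested at the
// request's write timestamp. A concurrency of 0 selects the naïve read/write
// loop above; a positive value selects the optimized Pebble rewriter.
func exampleRewriteSSTTimestamps(
	ctx context.Context, st *cluster.Settings, sst []byte, readTS, writeTS hlc.Timestamp,
) ([]byte, error) {
	rewritten, _, err := UpdateSSTTimestamps(
		ctx, st, sst, readTS, writeTS, 0 /* concurrency */, nil /* stats */)
	return rewritten, err
}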
// Copyright 2019 The Cockroach Authors.
//
// Use of this software is governed by the CockroachDB Software License
// included in the /LICENSE file.
package storage
import (
"bytes"
"context"
"io"
"github.com/cockroachdb/cockroach/pkg/roachpb"
"github.com/cockroachdb/cockroach/pkg/settings"
"github.com/cockroachdb/cockroach/pkg/settings/cluster"
"github.com/cockroachdb/cockroach/pkg/util/metamorphic"
"github.com/cockroachdb/errors"
"github.com/cockroachdb/pebble"
"github.com/cockroachdb/pebble/objstorage"
"github.com/cockroachdb/pebble/rangekey"
"github.com/cockroachdb/pebble/sstable"
)
// IngestionValueBlocksEnabled controls whether older versions of MVCC keys in
// the same ingested sstable will have their values written to value blocks.
// This configuration ability was motivated by a case of > 130GB sstables,
// caused by snapshot ingestion. Writing value blocks requires in-memory
// buffering of compressed value blocks, which caused OOMs in the above case.
var IngestionValueBlocksEnabled = settings.RegisterBoolSetting(
settings.ApplicationLevel,
"storage.ingestion.value_blocks.enabled",
"set to true to enable writing of value blocks in ingestion sstables",
metamorphic.ConstantWithTestBool(
"storage.ingestion.value_blocks.enabled", true),
settings.WithPublic)
// SSTWriter writes SSTables.
type SSTWriter struct {
fw *sstable.Writer
// DataSize tracks the total key and value bytes added so far.
DataSize int64
scratch []byte
Meta *sstable.WriterMetadata
}
var _ Writer = &SSTWriter{}
var _ ExportWriter = &SSTWriter{}
var _ InternalWriter = &SSTWriter{}
// NoopFinishAbortWritable wraps an io.Writer to make a objstorage.Writable that
// will ignore Finish and Abort calls.
func NoopFinishAbortWritable(w io.Writer) objstorage.Writable {
return &noopFinishAbort{Writer: w}
}
// noopFinishAbort is used to wrap io.Writers for sstable.Writer.
type noopFinishAbort struct {
io.Writer
}
var _ objstorage.Writable = (*noopFinishAbort)(nil)
// Write is part of the objstorage.Writable interface.
func (n *noopFinishAbort) Write(p []byte) error {
// An io.Writer always returns an error if it can't write the entire slice.
_, err := n.Writer.Write(p)
return err
}
// Finish is part of the objstorage.Writable interface.
func (*noopFinishAbort) Finish() error {
return nil
}
// Abort is part of the objstorage.Writable interface.
func (*noopFinishAbort) Abort() {}
// MakeIngestionWriterOptions returns writer options suitable for writing SSTs
// that will subsequently be ingested (e.g. with AddSSTable). These options are
// also used when constructing sstables for backups (because these sstables may
// ultimately be ingested during online restore).
func MakeIngestionWriterOptions(ctx context.Context, cs *cluster.Settings) sstable.WriterOptions {
// All supported versions understand TableFormatPebblev4. If columnar blocks
// are enabled and the active cluster version is at least 24.3, use
// TableFormatPebblev5.
format := sstable.TableFormatPebblev4
if ColumnarBlocksEnabled.Get(&cs.SV) {
format = sstable.TableFormatPebblev5
}
opts := DefaultPebbleOptions().MakeWriterOptions(0, format)
// By default, compress with the algorithm used for storage in a Pebble store.
// There are other, more specific, use cases that may call for a different
// algorithm, which can be set by overriding the default (see
// MakeIngestionSSTWriterWithOverrides).
opts.Compression = getCompressionAlgorithm(ctx, cs, CompressionAlgorithmStorage)
opts.MergerName = "nullptr"
if !IngestionValueBlocksEnabled.Get(&cs.SV) {
opts.DisableValueBlocks = true
}
return opts
}
// makeSSTRewriteOptions should be used instead of MakeIngestionWriterOptions
// when we are going to rewrite ssts. It additionally returns the minimum
// table format that we accept, since sst rewriting will often preserve the
// input table format.
func makeSSTRewriteOptions(
ctx context.Context, cs *cluster.Settings,
) (opts sstable.WriterOptions, minTableFormat sstable.TableFormat) {
// v22.2 clusters use sstable.TableFormatPebblev2.
return MakeIngestionWriterOptions(ctx, cs), sstable.TableFormatPebblev2
}
// MakeTransportSSTWriter creates a new SSTWriter tailored for sstables
// constructed exclusively for transport, which are typically only ever iterated
// in their entirety and not durably persisted. At the time of writing, this is
// used by export requests. During a backup, the export requests will construct
// sstables using this writer, those sstables will be sent over the network,
// scanned and their keys inserted into new sstables (NB: constructed using
// MakeIngestionSSTWriter) that ultimately are uploaded to object storage.
func MakeTransportSSTWriter(ctx context.Context, cs *cluster.Settings, f io.Writer) SSTWriter {
// By default, take a conservative approach and assume we don't have newer
// table features available. Upgrade to an appropriate version only if the
// cluster supports it.
format := sstable.TableFormatPebblev4
if ColumnarBlocksEnabled.Get(&cs.SV) {
format = sstable.TableFormatPebblev5
}
opts := DefaultPebbleOptions().MakeWriterOptions(0, format)
// Don't need value blocks.
opts.DisableValueBlocks = true
// Don't need BlockPropertyCollectors for backups.
opts.BlockPropertyCollectors = nil
// Disable bloom filters since we only ever iterate backups.
opts.FilterPolicy = nil
// Bump up block size, since we almost never seek or do point lookups, so more
// block checksums and more index entries are just overhead and smaller blocks
// reduce compression ratio.
opts.BlockSize = 128 << 10
opts.Compression = getCompressionAlgorithm(ctx, cs, CompressionAlgorithmBackupTransport)
opts.MergerName = "nullptr"
return SSTWriter{
fw: sstable.NewWriter(&noopFinishAbort{f}, opts),
}
}
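// exampleTransportSST is a hypothetical sketch (not part of the original
// source) of the export flow described above: stream a transport-only sstable
// to an arbitrary io.Writer (e.g. a network connection or bytes.Buffer). The
// key and value used here are illustrative.
func exampleTransportSST(ctx context.Context, cs *cluster.Settings, out io.Writer) error {
	w := MakeTransportSSTWriter(ctx, cs, out)
	defer w.Close()
	if err := w.PutUnversioned(roachpb.Key("a"), []byte("value")); err != nil {
		return err
	}
	return w.Finish()
}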
// MakeIngestionSSTWriter creates a new SSTWriter tailored for ingestion SSTs.
// These SSTs have bloom filters enabled (as set in DefaultPebbleOptions). If
// the cluster settings permit value blocks, the SST may contain value blocks.
// This writer is used when constructing sstables for backups too, because
// backup sstables may ultimately be ingested during online restore.
func MakeIngestionSSTWriter(
ctx context.Context, cs *cluster.Settings, w objstorage.Writable,
) SSTWriter {
return MakeIngestionSSTWriterWithOverrides(ctx, cs, w)
}
// SSTWriterOption augments one or more sstable.WriterOptions.
type SSTWriterOption func(opts *sstable.WriterOptions)
// WithValueBlocksDisabled disables the use of value blocks in an SSTable.
var WithValueBlocksDisabled SSTWriterOption = func(opts *sstable.WriterOptions) {
opts.DisableValueBlocks = true
}
// WithCompressionFromClusterSetting sets the compression algorithm for an
// SSTable based on the value of the given cluster setting.
func WithCompressionFromClusterSetting(
ctx context.Context, cs *cluster.Settings, setting *settings.EnumSetting[compressionAlgorithm],
) SSTWriterOption {
return func(opts *sstable.WriterOptions) {
opts.Compression = getCompressionAlgorithm(ctx, cs, setting)
}
}
// MakeIngestionSSTWriterWithOverrides creates a new SSTWriter tailored for
// ingestion SSTs. Note that this writer is also used when constructing
// sstables for backups, because backup sstables may ultimately be ingested
// during online restore.
//
// These SSTs have bloom filters enabled (as set in DefaultPebbleOptions) and
// format set to the highest permissible by the cluster settings. Callers that
// expect to write huge SSTs, say 200+MB, which could contain multiple versions
// for the same key, should pass in a WithValueBlocksDisabled option. This is
// because value blocks are buffered in-memory while writing the SST (see
// https://github.com/cockroachdb/cockroach/issues/117113).
func MakeIngestionSSTWriterWithOverrides(
ctx context.Context, cs *cluster.Settings, w objstorage.Writable, overrides ...SSTWriterOption,
) SSTWriter {
opts := MakeIngestionWriterOptions(ctx, cs)
for _, o := range overrides {
o(&opts)
}
return SSTWriter{
fw: sstable.NewWriter(w, opts),
}
}
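// exampleBuildIngestionSST is a hypothetical sketch (not part of the original
// source) showing the intended flow: write keys into an in-memory MemObject
// through an ingestion writer, finish it, and hand the bytes to an AddSSTable
// request. The WithValueBlocksDisabled override is shown because callers that
// expect huge ssts are advised above to disable value blocks.
func exampleBuildIngestionSST(ctx context.Context, cs *cluster.Settings) ([]byte, error) {
	obj := &MemObject{}
	w := MakeIngestionSSTWriterWithOverrides(ctx, cs, obj, WithValueBlocksDisabled)
	defer w.Close()
	if err := w.PutUnversioned(roachpb.Key("a"), []byte("value")); err != nil {
		return nil, err
	}
	if err := w.Finish(); err != nil {
		return nil, err
	}
	return obj.Data(), nil
}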
// Finish finalizes the writer, flushing the constructed sstable and populating
// Meta with the writer's metadata. At least one kv entry must have been added.
func (fw *SSTWriter) Finish() error {
if fw.fw == nil {
return errors.New("cannot call Finish on a closed writer")
}
if err := fw.fw.Close(); err != nil {
return err
}
var err error
fw.Meta, err = fw.fw.Raw().Metadata()
fw.fw = nil
return err
}
// ClearRawRange implements the Engine interface.
func (fw *SSTWriter) ClearRawRange(start, end roachpb.Key, pointKeys, rangeKeys bool) error {
fw.scratch = EngineKey{Key: start}.EncodeToBuf(fw.scratch[:0])
endRaw := EngineKey{Key: end}.Encode()
if pointKeys {
fw.DataSize += int64(len(start)) + int64(len(end))
if err := fw.fw.DeleteRange(fw.scratch, endRaw); err != nil {
return err
}
}
if rangeKeys {
fw.DataSize += int64(len(start)) + int64(len(end))
if err := fw.fw.RangeKeyDelete(fw.scratch, endRaw); err != nil {
return err
}
}
return nil
}
// ClearMVCCRange implements the Writer interface.
func (fw *SSTWriter) ClearMVCCRange(start, end roachpb.Key, pointKeys, rangeKeys bool) error {
panic("not implemented")
}
// ClearMVCCVersions implements the Writer interface.
func (fw *SSTWriter) ClearMVCCVersions(start, end MVCCKey) error {
return fw.clearRange(start, end)
}
// PutMVCCRangeKey implements the Writer interface.
func (fw *SSTWriter) PutMVCCRangeKey(rangeKey MVCCRangeKey, value MVCCValue) error {
// NB: all MVCC APIs currently assume all range keys are range tombstones.
if !value.IsTombstone() {
return errors.New("range keys can only be MVCC range tombstones")
}
valueRaw, err := EncodeMVCCValue(value)
if err != nil {
return errors.Wrapf(err, "failed to encode MVCC value for range key %s", rangeKey)
}
return fw.PutRawMVCCRangeKey(rangeKey, valueRaw)
}
// PutRawMVCCRangeKey implements the Writer interface.
func (fw *SSTWriter) PutRawMVCCRangeKey(rangeKey MVCCRangeKey, value []byte) error {
if err := rangeKey.Validate(); err != nil {
return err
}
return fw.PutEngineRangeKey(
rangeKey.StartKey, rangeKey.EndKey, EncodeMVCCTimestampSuffix(rangeKey.Timestamp), value)
}
// ClearMVCCRangeKey implements the Writer interface.
func (fw *SSTWriter) ClearMVCCRangeKey(rangeKey MVCCRangeKey) error {
if err := rangeKey.Validate(); err != nil {
return err
}
// If the range key holds an encoded timestamp as it was read from storage,
// write the tombstone to clear it using the same encoding of the timestamp.
// See #129592.
if len(rangeKey.EncodedTimestampSuffix) > 0 {
return fw.ClearEngineRangeKey(rangeKey.StartKey, rangeKey.EndKey,
rangeKey.EncodedTimestampSuffix)
}
return fw.ClearEngineRangeKey(rangeKey.StartKey, rangeKey.EndKey,
EncodeMVCCTimestampSuffix(rangeKey.Timestamp))
}
// PutEngineRangeKey implements the Writer interface.
func (fw *SSTWriter) PutEngineRangeKey(start, end roachpb.Key, suffix, value []byte) error {
// MVCC values don't account for the timestamp, so we don't account
// for the suffix here.
fw.DataSize += int64(len(start)) + int64(len(end)) + int64(len(value))
return fw.fw.RangeKeySet(
EngineKey{Key: start}.Encode(), EngineKey{Key: end}.Encode(), suffix, value)
}
// ClearEngineRangeKey implements the Writer interface.
func (fw *SSTWriter) ClearEngineRangeKey(start, end roachpb.Key, suffix []byte) error {
// MVCC values don't account for the timestamp, so we don't account for the
// suffix here.
fw.DataSize += int64(len(start)) + int64(len(end))
return fw.fw.RangeKeyUnset(EngineKey{Key: start}.Encode(), EngineKey{Key: end}.Encode(), suffix)
}
// ClearEngineRange clears point keys in the specified EngineKey range.
func (fw *SSTWriter) ClearEngineRange(start, end EngineKey) error {
fw.scratch = start.EncodeToBuf(fw.scratch[:0])
endRaw := end.Encode()
fw.DataSize += int64(len(start.Key)) + int64(len(end.Key))
if err := fw.fw.DeleteRange(fw.scratch, endRaw); err != nil {
return err
}
return nil
}
// ClearRawEncodedRange implements the InternalWriter interface.
func (fw *SSTWriter) ClearRawEncodedRange(start, end []byte) error {
startEngine, ok := DecodeEngineKey(start)
if !ok {
return errors.New("cannot decode start engine key")
}
endEngine, ok := DecodeEngineKey(end)
if !ok {
return errors.New("cannot decode end engine key")
}
fw.DataSize += int64(len(startEngine.Key)) + int64(len(endEngine.Key))
return fw.fw.DeleteRange(start, end)
}
// PutInternalRangeKey implements the InternalWriter interface.
func (fw *SSTWriter) PutInternalRangeKey(start, end []byte, key rangekey.Key) error {
startEngine, ok := DecodeEngineKey(start)
if !ok {
return errors.New("cannot decode engine key")
}
endEngine, ok := DecodeEngineKey(end)
if !ok {
return errors.New("cannot decode engine key")
}
fw.DataSize += int64(len(startEngine.Key)) + int64(len(endEngine.Key)) + int64(len(key.Value))
switch key.Kind() {
case pebble.InternalKeyKindRangeKeyUnset:
return fw.fw.RangeKeyUnset(start, end, key.Suffix)
case pebble.InternalKeyKindRangeKeySet:
return fw.fw.RangeKeySet(start, end, key.Suffix, key.Value)
case pebble.InternalKeyKindRangeKeyDelete:
return fw.fw.RangeKeyDelete(start, end)
default:
panic("unexpected range key kind")
}
}
// PutInternalPointKey implements the InternalWriter interface.
func (fw *SSTWriter) PutInternalPointKey(key *pebble.InternalKey, value []byte) error {
ek, ok := DecodeEngineKey(key.UserKey)
if !ok {
return errors.New("cannot decode engine key")
}
fw.DataSize += int64(len(ek.Key)) + int64(len(value))
return fw.fw.Raw().AddWithForceObsolete(*key, value, false /* forceObsolete */)
}
// clearRange clears all point keys in the given range by dropping a Pebble
// range tombstone.
//
// NB: Does not clear range keys.
func (fw *SSTWriter) clearRange(start, end MVCCKey) error {
if fw.fw == nil {
return errors.New("cannot call ClearRange on a closed writer")
}
fw.DataSize += int64(len(start.Key)) + int64(len(end.Key))
fw.scratch = EncodeMVCCKeyToBuf(fw.scratch[:0], start)
return fw.fw.DeleteRange(fw.scratch, EncodeMVCCKey(end))
}
// Put puts a kv entry into the sstable being built. An error is returned if it
// is not greater than any previously added entry (according to the comparator
// configured during writer creation). `Close` cannot have been called.
//
// TODO(sumeer): Put has been removed from the Writer interface, but there
// are many callers of this SSTWriter method. Fix those callers and remove.
func (fw *SSTWriter) Put(key MVCCKey, value []byte) error {
if fw.fw == nil {
return errors.New("cannot call Put on a closed writer")
}
fw.DataSize += int64(len(key.Key)) + int64(len(value))
fw.scratch = EncodeMVCCKeyToBuf(fw.scratch[:0], key)
return fw.fw.Set(fw.scratch, value)
}
// PutMVCC implements the Writer interface.
// An error is returned if it is not greater than any previously added entry
// (according to the comparator configured during writer creation). `Close`
// cannot have been called.
func (fw *SSTWriter) PutMVCC(key MVCCKey, value MVCCValue) error {
if key.Timestamp.IsEmpty() {
panic("PutMVCC timestamp is empty")
}
encValue, err := EncodeMVCCValue(value)
if err != nil {
return err
}
return fw.put(key, encValue)
}
// PutRawMVCC implements the Writer interface.
// An error is returned if it is not greater than any previously added entry
// (according to the comparator configured during writer creation). `Close`
// cannot have been called.
func (fw *SSTWriter) PutRawMVCC(key MVCCKey, value []byte) error {
if key.Timestamp.IsEmpty() {
panic("PutRawMVCC timestamp is empty")
}
return fw.put(key, value)
}
// PutUnversioned implements the Writer interface.
// An error is returned if it is not greater than any previously added entry
// (according to the comparator configured during writer creation). `Close`
// cannot have been called.
func (fw *SSTWriter) PutUnversioned(key roachpb.Key, value []byte) error {
return fw.put(MVCCKey{Key: key}, value)
}
// PutEngineKey implements the Writer interface.
// An error is returned if it is not greater than any previously added entry
// (according to the comparator configured during writer creation). `Close`
// cannot have been called.
func (fw *SSTWriter) PutEngineKey(key EngineKey, value []byte) error {
if fw.fw == nil {
return errors.New("cannot call Put on a closed writer")
}
fw.DataSize += int64(len(key.Key)) + int64(len(value))
fw.scratch = key.EncodeToBuf(fw.scratch[:0])
return fw.fw.Set(fw.scratch, value)
}
// put puts a kv entry into the sstable being built. An error is returned if it
// is not greater than any previously added entry (according to the comparator
// configured during writer creation). `Close` cannot have been called.
func (fw *SSTWriter) put(key MVCCKey, value []byte) error {
if fw.fw == nil {
return errors.New("cannot call Put on a closed writer")
}
fw.DataSize += int64(len(key.Key)) + int64(len(value))
fw.scratch = EncodeMVCCKeyToBuf(fw.scratch[:0], key)
return fw.fw.Set(fw.scratch, value)
}
// ApplyBatchRepr implements the Writer interface.
func (fw *SSTWriter) ApplyBatchRepr(repr []byte, sync bool) error {
panic("unimplemented")
}
// ClearMVCC implements the Writer interface. An error is returned if it is
// not greater than any previous point key passed to this Writer (according to
// the comparator configured during writer creation). `Close` cannot have been
// called.
func (fw *SSTWriter) ClearMVCC(key MVCCKey, opts ClearOptions) error {
if key.Timestamp.IsEmpty() {
panic("ClearMVCC timestamp is empty")
}
return fw.clear(key, opts)
}
// ClearUnversioned implements the Writer interface. An error is returned if
// it is not greater than any previous point key passed to this Writer
// (according to the comparator configured during writer creation). `Close`
// cannot have been called.
func (fw *SSTWriter) ClearUnversioned(key roachpb.Key, opts ClearOptions) error {
return fw.clear(MVCCKey{Key: key}, opts)
}
// ClearEngineKey implements the Writer interface. An error is returned if it is
// not greater than any previous point key passed to this Writer (according to
// the comparator configured during writer creation). `Close` cannot have been
// called.
func (fw *SSTWriter) ClearEngineKey(key EngineKey, opts ClearOptions) error {
if fw.fw == nil {
return errors.New("cannot call Clear on a closed writer")
}
fw.scratch = key.EncodeToBuf(fw.scratch[:0])
fw.DataSize += int64(len(key.Key))
// TODO(jackson): We could use opts.ValueSize if known, but it would require
// additional logic around ensuring the cluster version is at least
// V23_2_UseSizedPebblePointTombstones. It's probably not worth it until we
// can unconditionally use it; I don't believe we ever write point
// tombstones to sstables constructed within Cockroach.
return fw.fw.Delete(fw.scratch)
}
// An error is returned if it is not greater than any previous point key
// passed to this Writer (according to the comparator configured during writer
// creation). `Close` cannot have been called.
func (fw *SSTWriter) clear(key MVCCKey, opts ClearOptions) error {
if fw.fw == nil {
return errors.New("cannot call Clear on a closed writer")
}
fw.scratch = EncodeMVCCKeyToBuf(fw.scratch[:0], key)
fw.DataSize += int64(len(key.Key))
// TODO(jackson): We could use opts.ValueSize if known, but it would require
// additional logic around ensuring the cluster version is at least
// V23_2_UseSizedPebblePointTombstones. It's probably not worth it until we
// can unconditionally use it; I don't believe we ever write point
// tombstones to sstables constructed within Cockroach.
return fw.fw.Delete(fw.scratch)
}
// SingleClearEngineKey implements the Writer interface.
func (fw *SSTWriter) SingleClearEngineKey(key EngineKey) error {
panic("unimplemented")
}
// ClearMVCCIteratorRange implements the Writer interface.
func (fw *SSTWriter) ClearMVCCIteratorRange(_, _ roachpb.Key, _, _ bool) error {
panic("not implemented")
}
// Merge implements the Writer interface.
func (fw *SSTWriter) Merge(key MVCCKey, value []byte) error {
if fw.fw == nil {
return errors.New("cannot call Merge on a closed writer")
}
fw.DataSize += int64(len(key.Key)) + int64(len(value))
fw.scratch = EncodeMVCCKeyToBuf(fw.scratch[:0], key)
return fw.fw.Merge(fw.scratch, value)
}
// LogData implements the Writer interface.
func (fw *SSTWriter) LogData(data []byte) error {
// No-op.
return nil
}
// LogLogicalOp implements the Writer interface.
func (fw *SSTWriter) LogLogicalOp(op MVCCLogicalOpType, details MVCCLogicalOpDetails) {
// No-op.
}
// Close finishes and frees memory and other resources. Close is idempotent.
func (fw *SSTWriter) Close() {
if fw.fw == nil {
return
}
// pebble.Writer *does* return interesting errors from Close... but normally
// we already called its Close() in Finish() and we no-op here. Thus the only
// time we expect to be here is in a deferred Close(), in which case the caller
// probably is already returning some other error, so returning one from this
// method just makes for messy defers.
_ = fw.fw.Close()
fw.fw = nil
}
// ShouldWriteLocalTimestamps implements the Writer interface.
func (fw *SSTWriter) ShouldWriteLocalTimestamps(context.Context) bool {
return false
}
// BufferedSize implements the Writer interface.
func (fw *SSTWriter) BufferedSize() int {
return 0
}
// EstimatedSize returns the underlying RawWriter's estimated size. Note that
// this size is an estimate as if the writer were to be closed at the time of
// calling.
func (fw *SSTWriter) EstimatedSize() uint64 {
return fw.fw.Raw().EstimatedSize()
}
// MemObject is an in-memory implementation of objstorage.Writable, intended
// for use with SSTWriter.
type MemObject struct {
bytes.Buffer
}
var _ objstorage.Writable = (*MemObject)(nil)
// Write is part of the objstorage.Writable interface.
func (f *MemObject) Write(p []byte) error {
_, err := f.Buffer.Write(p)
return err
}
// Finish is part of the objstorage.Writable interface.
func (*MemObject) Finish() error {
return nil
}
// Abort is part of the objstorage.Writable interface.
func (*MemObject) Abort() {}
// Close implements the writeCloseSyncer interface.
func (*MemObject) Close() error {
return nil
}
// Data returns the in-memory buffer behind this MemObject.
func (f *MemObject) Data() []byte {
return f.Bytes()
}
// Copyright 2021 The Cockroach Authors.
//
// Use of this software is governed by the CockroachDB Software License
// included in the /LICENSE file.
package storage
import (
"context"
"path/filepath"
"strings"
"github.com/cockroachdb/cockroach/pkg/roachpb"
"github.com/cockroachdb/cockroach/pkg/util/log"
"github.com/elastic/gosigar"
)
func computeStoreProperties(ctx context.Context, cfg engineConfig) roachpb.StoreProperties {
props := roachpb.StoreProperties{
Dir: cfg.env.Dir,
ReadOnly: cfg.env.IsReadOnly(),
Encrypted: cfg.env.Encryption != nil,
}
if cfg.opts.WALFailover != nil {
props.WalFailoverPath = new(string)
*props.WalFailoverPath = cfg.opts.WALFailover.Secondary.Dirname
}
// In-memory store?
if cfg.env.Dir == "" {
return props
}
fsprops := getFileSystemProperties(ctx, cfg.env.Dir)
props.FileStoreProperties = &fsprops
return props
}
func getFileSystemProperties(ctx context.Context, dir string) roachpb.FileStoreProperties {
fsprops := roachpb.FileStoreProperties{
Path: dir,
}
// Find which filesystem supports the store.
absPath, err := filepath.Abs(dir)
if err != nil {
log.Warningf(ctx, "cannot compute absolute file path for %q: %v", dir, err)
return fsprops
}
// Alas, only BSD reliably populates "fs" in os.StatFs(),
// so we must find the filesystem manually.
//
// Note that scanning the list of mounts is also
// what Linux's df(1) command does.
//
var fslist gosigar.FileSystemList
if err := fslist.Get(); err != nil {
log.Warningf(ctx, "cannot retrieve filesystem list: %v", err)
return fsprops
}
var fsInfo *gosigar.FileSystem
// We're reading the list of mounts in reverse order: we're assuming
// that mounts are LIFO and can only be stacked, so the best match
// will necessarily be the first filesystem that's a prefix of the
// target directory, when looking from the end of the file.
//
// TODO(ssd): Steven points out that gosigar reads from /etc/mtab on
// linux, which is sometimes managed by the user command 'mount' and
// can sometimes miss entries when `mount -n` is used. It might be
// better to change gosigar to use /proc/mounts instead.
//
// FWIW, we are OK with this for now, since the systems where crdb
// is typically being deployed are well-behaved in that regard:
// Kubernetes mirrors /proc/mount in /etc/mtab.
for i := len(fslist.List) - 1; i >= 0; i-- {
if pathIsInside(fslist.List[i].DirName, absPath) {
fsInfo = &fslist.List[i]
break
}
}
if fsInfo == nil {
// This is surprising!? We're expecting at least a match on the
// root filesystem. Oh well.
return fsprops
}
fsprops.FsType = fsInfo.SysTypeName
fsprops.BlockDevice = fsInfo.DevName
fsprops.MountPoint = fsInfo.DirName
fsprops.MountOptions = fsInfo.Options
return fsprops
}
// pathIsInside returns true if the absolute target path is inside a base path.
func pathIsInside(basePath string, absTargetPath string) bool {
// filepath.Rel can reliably tell us if a path is relative to
// another: if it is not, an error is returned.
relPath, err := filepath.Rel(basePath, absTargetPath)
if err != nil {
return false
}
if strings.HasPrefix(relPath, "..") {
// This check is consistent with internal filepath code (like isLocal).
if len(relPath) == 2 || relPath[2] == filepath.Separator {
return false
}
}
return true
}
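// examplePathIsInside is a hypothetical sketch (not part of the original
// source) of how the mount-matching loop above relies on pathIsInside: a store
// directory under a mount point matches, while a sibling directory that merely
// shares a name prefix does not.
func examplePathIsInside() {
	_ = pathIsInside("/mnt/data", "/mnt/data/cockroach/store1") // true
	_ = pathIsInside("/mnt/data", "/mnt/database")              // false: relative path starts with "../"
	_ = pathIsInside("/", "/mnt/data")                          // true: the root filesystem matches everything
}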
// Copyright 2017 The Cockroach Authors.
//
// Use of this software is governed by the CockroachDB Software License
// included in the /LICENSE file.
package storage
import (
"context"
"github.com/cockroachdb/cockroach/pkg/base"
"github.com/cockroachdb/cockroach/pkg/kv/kvserver/diskmap"
"github.com/cockroachdb/cockroach/pkg/storage/disk"
"github.com/cockroachdb/cockroach/pkg/storage/fs"
"github.com/cockroachdb/cockroach/pkg/util/log"
"github.com/cockroachdb/errors"
"github.com/cockroachdb/pebble"
"github.com/cockroachdb/pebble/vfs"
)
// NewTempEngine creates a new engine for DistSQL processors to use when
// the working set is larger than can be stored in memory.
func NewTempEngine(
ctx context.Context,
tempStorage base.TempStorageConfig,
storeSpec base.StoreSpec,
diskWriteStats disk.WriteStatsManager,
) (diskmap.Factory, vfs.FS, error) {
return NewPebbleTempEngine(ctx, tempStorage, storeSpec, diskWriteStats)
}
type pebbleTempEngine struct {
db *pebble.DB
closeFunc func()
}
// Close implements the diskmap.Factory interface.
func (r *pebbleTempEngine) Close() {
if err := r.db.Close(); err != nil {
log.Fatalf(context.TODO(), "%v", err)
}
r.closeFunc()
}
// NewSortedDiskMap implements the diskmap.Factory interface.
func (r *pebbleTempEngine) NewSortedDiskMap() diskmap.SortedDiskMap {
return newPebbleMap(r.db, false /* allowDuplicates */)
}
// NewSortedDiskMultiMap implements the diskmap.Factory interface.
func (r *pebbleTempEngine) NewSortedDiskMultiMap() diskmap.SortedDiskMap {
return newPebbleMap(r.db, true /* allowDuplicates */)
}
// NewPebbleTempEngine creates a new Pebble engine for DistSQL processors to use
// when the working set is larger than can be stored in memory.
func NewPebbleTempEngine(
ctx context.Context,
tempStorage base.TempStorageConfig,
storeSpec base.StoreSpec,
diskWriteStats disk.WriteStatsManager,
) (diskmap.Factory, vfs.FS, error) {
return newPebbleTempEngine(ctx, tempStorage, storeSpec, diskWriteStats)
}
func newPebbleTempEngine(
ctx context.Context,
tempStorage base.TempStorageConfig,
storeSpec base.StoreSpec,
diskWriteStats disk.WriteStatsManager,
) (*pebbleTempEngine, vfs.FS, error) {
var baseFS vfs.FS
var dir string
var cacheSize int64 = 128 << 20 // 128 MiB, arbitrary, but not "too big"
if tempStorage.InMemory {
cacheSize = 8 << 20 // 8 MiB, smaller for in-memory, still non-zero
baseFS = vfs.NewMem()
} else {
baseFS = vfs.Default
dir = tempStorage.Path
}
env, err := fs.InitEnv(ctx, baseFS, dir, fs.EnvConfig{
RW: fs.ReadWrite,
// Adopt the encryption options of the provided store spec so that
// temporary data is encrypted if the store is encrypted.
EncryptionOptions: storeSpec.EncryptionOptions,
}, diskWriteStats)
if err != nil {
return nil, nil, err
}
var statsCollector *vfs.DiskWriteStatsCollector
if diskWriteStats != nil && !tempStorage.InMemory {
statsCollector, err = diskWriteStats.GetOrCreateCollector(dir)
if err != nil {
return nil, nil, errors.Wrap(err, "retrieving stats collector")
}
}
p, err := Open(ctx, env,
tempStorage.Settings,
CacheSize(cacheSize),
func(cfg *engineConfig) error {
// The Pebble temp engine does not use MVCC Encoding. Instead, the
// caller-provided key is used as-is (with the prefix prepended). See
// pebbleMap.makeKey and pebbleMap.makeKeyWithSequence on how this works.
// Use the default bytes.Compare-like comparer.
cfg.opts.Comparer = pebble.DefaultComparer
cfg.opts.KeySchemas = nil
cfg.opts.KeySchema = ""
cfg.opts.DisableWAL = true
cfg.opts.Experimental.UserKeyCategories = pebble.UserKeyCategories{}
cfg.opts.BlockPropertyCollectors = nil
cfg.opts.EnableSQLRowSpillMetrics = true
cfg.DiskWriteStatsCollector = statsCollector
return nil
},
)
if err != nil {
return nil, nil, err
}
// Set store ID for the pebble engine. We are not using shared storage for
// temp stores so this cannot error out.
_ = p.SetStoreID(ctx, base.TempStoreID)
return &pebbleTempEngine{
db: p.db,
closeFunc: env.Close,
}, env, nil
}
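// exampleTempEngine is a hypothetical sketch (not part of the original source)
// of how a DistSQL-style caller might obtain a sorted-disk-map factory from a
// temp engine and release it when done. The argument values are assumed to be
// supplied by the server's startup code.
func exampleTempEngine(
	ctx context.Context,
	tempStorage base.TempStorageConfig,
	storeSpec base.StoreSpec,
	diskWriteStats disk.WriteStatsManager,
) error {
	factory, _, err := NewTempEngine(ctx, tempStorage, storeSpec, diskWriteStats)
	if err != nil {
		return err
	}
	// The factory hands out sorted disk maps via NewSortedDiskMap and
	// NewSortedDiskMultiMap; Close releases the underlying Pebble instance.
	defer factory.Close()
	return nil
}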
// Copyright 2022 The Cockroach Authors.
//
// Use of this software is governed by the CockroachDB Software License
// included in the /LICENSE file.
package storage
import "github.com/cockroachdb/pebble"
// verifyingMVCCIterator is an MVCC iterator that wraps a pebbleIterator and
// verifies roachpb.Value checksums for encountered values.
type verifyingMVCCIterator struct {
*pebbleIterator // concrete type to avoid dynamic dispatch
valid bool
err error
key MVCCKey
value []byte
hasPoint bool
hasRange bool
}
// newVerifyingMVCCIterator creates a new VerifyingMVCCIterator.
func newVerifyingMVCCIterator(iter *pebbleIterator) MVCCIterator {
return &verifyingMVCCIterator{pebbleIterator: iter}
}
// saveAndVerify fetches the current key and value, saves them in the iterator,
// and verifies the value.
func (i *verifyingMVCCIterator) saveAndVerify() {
if i.valid, i.err = i.pebbleIterator.Valid(); !i.valid || i.err != nil {
return
}
i.key = i.pebbleIterator.UnsafeKey()
i.hasPoint, i.hasRange = i.pebbleIterator.HasPointAndRange()
if i.hasPoint {
i.value, _ = i.pebbleIterator.UnsafeValue()
if i.key.IsValue() {
mvccValue, err := decodeMVCCValueIgnoringHeader(i.value)
if err == nil {
err = mvccValue.Value.Verify(i.key.Key)
}
if err != nil {
i.err = err
i.valid = false
return
}
}
}
}
// Next implements MVCCIterator.
func (i *verifyingMVCCIterator) Next() {
i.pebbleIterator.Next()
i.saveAndVerify()
}
// NextKey implements MVCCIterator.
func (i *verifyingMVCCIterator) NextKey() {
i.pebbleIterator.NextKey()
i.saveAndVerify()
}
// Prev implements MVCCIterator.
func (i *verifyingMVCCIterator) Prev() {
i.pebbleIterator.Prev()
i.saveAndVerify()
}
// SeekGE implements MVCCIterator.
func (i *verifyingMVCCIterator) SeekGE(key MVCCKey) {
i.pebbleIterator.SeekGE(key)
i.saveAndVerify()
}
// SeekLT implements MVCCIterator.
func (i *verifyingMVCCIterator) SeekLT(key MVCCKey) {
i.pebbleIterator.SeekLT(key)
i.saveAndVerify()
}
// UnsafeKey implements MVCCIterator.
func (i *verifyingMVCCIterator) UnsafeKey() MVCCKey {
return i.key
}
// UnsafeValue implements MVCCIterator.
func (i *verifyingMVCCIterator) UnsafeValue() ([]byte, error) {
return i.value, nil
}
// MVCCValueLenAndIsTombstone implements MVCCIterator.
func (i *verifyingMVCCIterator) MVCCValueLenAndIsTombstone() (int, bool, error) {
isTombstone, err := EncodedMVCCValueIsTombstone(i.value)
if err != nil {
return 0, false, err
}
return len(i.value), isTombstone, nil
}
// ValueLen implements MVCCIterator.
func (i *verifyingMVCCIterator) ValueLen() int {
return len(i.value)
}
// Valid implements MVCCIterator.
func (i *verifyingMVCCIterator) Valid() (bool, error) {
return i.valid, i.err
}
// HasPointAndRange implements MVCCIterator.
func (i *verifyingMVCCIterator) HasPointAndRange() (bool, bool) {
return i.hasPoint, i.hasRange
}
// UnsafeLazyValue implements MVCCIterator.
func (i *verifyingMVCCIterator) UnsafeLazyValue() pebble.LazyValue {
return pebble.LazyValue{ValueOrHandle: i.value}
}
// Copyright 2024 The Cockroach Authors.
//
// Use of this software is governed by the CockroachDB Software License
// included in the /LICENSE file.
package skip
// ClampMetamorphicConstantUnderDuress ensures that the given integer constant
// with metamorphic testing range is at least the given minimum value, when the
// process is running under duress.
func ClampMetamorphicConstantUnderDuress(val, min int) int {
if Duress() && val < min {
return min
}
return val
}
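// exampleClampUnderDuress is a hypothetical sketch (not part of the original
// source) of the intended use: a metamorphically chosen constant keeps its
// randomized value in ordinary runs, but is raised to a sane floor when tests
// run under duress (race, deadlock detector, or stress).
func exampleClampUnderDuress(metamorphicVal int) int {
	// With metamorphicVal == 3 and a floor of 10, this returns 10 under duress
	// and 3 otherwise.
	return ClampMetamorphicConstantUnderDuress(metamorphicVal, 10)
}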
// Copyright 2020 The Cockroach Authors.
//
// Use of this software is governed by the CockroachDB Software License
// included in the /LICENSE file.
package skip
import (
"flag"
"fmt"
"os"
"strings"
"testing"
"github.com/cockroachdb/cockroach/pkg/build/bazel"
"github.com/cockroachdb/cockroach/pkg/util"
"github.com/cockroachdb/cockroach/pkg/util/buildutil"
"github.com/cockroachdb/cockroach/pkg/util/envutil"
"github.com/cockroachdb/cockroach/pkg/util/metamorphic/metamorphicutil"
"github.com/cockroachdb/cockroach/pkg/util/syncutil"
)
// SkippableTest is a testing.TB with Skip methods.
type SkippableTest interface {
Helper()
Skip(...interface{})
Skipf(string, ...interface{})
}
// WithIssue skips this test, logging the given issue ID as the reason.
func WithIssue(t SkippableTest, githubIssueID int, args ...interface{}) {
t.Helper()
maybeSkip(t, fmt.Sprintf("https://github.com/cockroachdb/cockroach/issues/%d", githubIssueID),
args...)
}
// Unimplemented skips this test case, logging the given issue ID. It
// is included in addition to WithIssue to allow the caller to signal
// that this test is not being skipped because of a bug, but rather
// because of an unimplemented feature.
func Unimplemented(t SkippableTest, githubIssueID int, args ...interface{}) {
t.Helper()
maybeSkip(t, withIssue("unimplemented", githubIssueID), args...)
}
// IgnoreLint skips this test, explicitly marking it as not a test that
// should be tracked as a "skipped test" by external tools. You should use this
// if, for example, your test should only be run in Race mode.
//
// Does not respect COCKROACH_FORCE_RUN_SKIPPED_TESTS.
func IgnoreLint(t SkippableTest, args ...interface{}) {
t.Helper()
t.Skip(args...)
}
// IgnoreLintf is like IgnoreLint, and it also takes a format string.
//
// Does not respect COCKROACH_FORCE_RUN_SKIPPED_TESTS.
func IgnoreLintf(t SkippableTest, format string, args ...interface{}) {
t.Helper()
t.Skipf(format, args...)
}
// UnderDeadlock skips this test if the deadlock detector is enabled.
func UnderDeadlock(t SkippableTest, args ...interface{}) {
t.Helper()
if syncutil.DeadlockEnabled {
maybeSkip(t, "disabled under deadlock detector", args...)
}
}
// UnderDeadlockWithIssue skips this test if the deadlock detector is enabled,
// logging the given issue ID as the reason.
func UnderDeadlockWithIssue(t SkippableTest, githubIssueID int, args ...interface{}) {
t.Helper()
if syncutil.DeadlockEnabled {
maybeSkip(t, withIssue("disabled under deadlock detector", githubIssueID), args...)
}
}
// UnderRace skips this test if the race detector is enabled.
func UnderRace(t SkippableTest, args ...interface{}) {
t.Helper()
if util.RaceEnabled {
maybeSkip(t, "disabled under race", args...)
}
}
// UnderRaceWithIssue skips this test if the race detector is enabled,
// logging the given issue ID as the reason.
func UnderRaceWithIssue(t SkippableTest, githubIssueID int, args ...interface{}) {
t.Helper()
if util.RaceEnabled {
maybeSkip(t, withIssue("disabled under race", githubIssueID), args...)
}
}
// UnderBazelWithIssue skips this test if we are building inside bazel,
// logging the given issue ID as the reason.
func UnderBazelWithIssue(t SkippableTest, githubIssueID int, args ...interface{}) {
t.Helper()
if bazel.BuiltWithBazel() {
maybeSkip(t, withIssue("disabled under bazel", githubIssueID), args...)
}
}
// Ignore unused warnings.
var _ = UnderBazelWithIssue
// UnderShort skips this test if the -short flag is specified.
func UnderShort(t SkippableTest, args ...interface{}) {
t.Helper()
if testing.Short() {
maybeSkip(t, "disabled under -short", args...)
}
}
// UnderStress skips this test when running under stress.
func UnderStress(t SkippableTest, args ...interface{}) {
t.Helper()
if Stress() {
maybeSkip(t, "disabled under stress", args...)
}
}
// UnderStressWithIssue skips this test when running under stress, logging the
// given issue ID as the reason.
func UnderStressWithIssue(t SkippableTest, githubIssueID int, args ...interface{}) {
t.Helper()
if Stress() {
maybeSkip(t, withIssue("disabled under stress", githubIssueID), args...)
}
}
// UnderMetamorphic skips this test during metamorphic runs, which are tests
// run with the metamorphic build tag.
func UnderMetamorphic(t SkippableTest, args ...interface{}) {
t.Helper()
if metamorphicutil.IsMetamorphicBuild {
maybeSkip(t, "disabled under metamorphic", args...)
}
}
// UnderMetamorphicWithIssue skips this test during metamorphic runs, which are
// tests run with the metamorphic build tag, logging the given issue ID as the
// reason.
func UnderMetamorphicWithIssue(t SkippableTest, githubIssueID int, args ...interface{}) {
t.Helper()
if metamorphicutil.IsMetamorphicBuild {
maybeSkip(t, withIssue("disabled under metamorphic", githubIssueID), args...)
}
}
// UnderNonTestBuild skips this test if the build does not have the crdb_test
// tag.
func UnderNonTestBuild(t SkippableTest) {
if !buildutil.CrdbTestBuild {
maybeSkip(t, "crdb_test tag required for this test")
}
}
// UnderDuress skips the test if we are running under any of the
// conditions we have observed as producing slow builds.
func UnderDuress(t SkippableTest, args ...interface{}) {
t.Helper()
if Duress() {
skipReason := fmt.Sprintf("duress (current config %s)", testConfig())
maybeSkip(t, skipReason, args...)
}
}
// UnderDuressWithIssue skips the test if we are running under any of the
// conditions we have observed as producing slow builds, logging the given
// issue ID as the reason.
func UnderDuressWithIssue(t SkippableTest, githubIssueID int, args ...interface{}) {
t.Helper()
if Duress() {
skipReason := fmt.Sprintf("duress (current config %s)", testConfig())
maybeSkip(t, withIssue(skipReason, githubIssueID), args...)
}
}
// Duress captures the conditions that currently lead us to
// believe that tests may be slower than normal.
func Duress() bool {
return util.RaceEnabled || Stress() || syncutil.DeadlockEnabled
}
// UnderBench returns true iff a test is currently running under `go
// test -bench`. When true, tests should avoid writing data on
// stdout/stderr from goroutines that run asynchronously with the
// test.
func UnderBench() bool {
// We rely on the fact that `go test -bench` passes the benchmark pattern
// to the test binary via the `-test.bench` flag.
f := flag.Lookup("test.bench")
return f != nil && f.Value.String() != ""
}
// UnderRemoteExecutionWithIssue skips the given test under remote test
// execution, logging the given issue ID as the reason.
func UnderRemoteExecutionWithIssue(t SkippableTest, githubIssueID int, args ...interface{}) {
t.Helper()
isRemote := os.Getenv("REMOTE_EXEC")
if len(isRemote) > 0 {
maybeSkip(t, withIssue("disabled under race", githubIssueID), args...)
}
}
func testConfig() string {
configs := []string{}
if Stress() {
configs = append(configs, "stress")
}
if util.RaceEnabled {
configs = append(configs, "race")
}
if syncutil.DeadlockEnabled {
configs = append(configs, "deadlock")
}
return strings.Join(configs, ",")
}
func withIssue(reason string, githubIssueID int) string {
return fmt.Sprintf(
"%s. issue: https://github.com/cockroachdb/cockroach/issues/%d",
reason,
githubIssueID,
)
}
var forceRunSkippedTests = envutil.EnvOrDefaultBool("COCKROACH_FORCE_RUN_SKIPPED_TESTS", false)
func maybeSkip(t SkippableTest, reason string, args ...interface{}) {
if forceRunSkippedTests {
return
}
t.Skip(append([]interface{}{reason}, args...)...)
}
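// Illustrative usage sketch: how a test might combine the helpers above to
// guard against slow build configurations. It assumes this package is
// importable as "github.com/cockroachdb/cockroach/pkg/testutils/skip"; the
// test name and issue number are placeholders.
package skip_test

import (
	"testing"

	"github.com/cockroachdb/cockroach/pkg/testutils/skip"
)

func TestExpensiveFeature(t *testing.T) {
	// Point readers at a (placeholder) tracking issue for the race skip.
	skip.UnderRaceWithIssue(t, 12345, "too slow under the race detector")
	// Skip under any configuration known to slow tests down
	// (race, stress, or deadlock detection).
	skip.UnderDuress(t, "test is timing sensitive")
	// ... test body ...
}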
// Copyright 2020 The Cockroach Authors.
//
// Use of this software is governed by the CockroachDB Software License
// included in the /LICENSE file.
package skip
import "github.com/cockroachdb/cockroach/pkg/util/envutil"
var nightlyStress = envutil.EnvOrDefaultBool("COCKROACH_NIGHTLY_STRESS", false)
var stress = envutil.EnvOrDefaultBool("COCKROACH_STRESS", false)
// NightlyStress returns true iff the process is running as part of CockroachDB's
// nightly stress tests.
func NightlyStress() bool {
return nightlyStress
}
// Stress returns true iff the process is running under any instance of the stress
// harness, including the nightly one.
func Stress() bool {
return stress || nightlyStress
}
// Copyright 2016 The Cockroach Authors.
//
// Use of this software is governed by the CockroachDB Software License
// included in the /LICENSE file.
package envutil
import (
"bytes"
"fmt"
"os"
"os/user"
"runtime"
"sort"
"strconv"
"strings"
"time"
"github.com/cockroachdb/cockroach/pkg/util/humanizeutil"
"github.com/cockroachdb/cockroach/pkg/util/syncutil"
"github.com/cockroachdb/redact"
)
type envVarInfo struct {
consumer string
present bool
value string
}
var envVarRegistry struct {
mu syncutil.Mutex
cache map[string]envVarInfo
}
func init() {
ClearEnvCache()
}
func checkVarName(name string) {
// Env vars must:
// - be uppercase
// - only contain letters, digits, and _
valid := true
for i := 0; valid && i < len(name); i++ {
c := name[i]
valid = ((c >= 'A' && c <= 'Z') || (c >= '0' && c <= '9') || c == '_')
}
if !valid {
panic("invalid env var name " + name)
}
}
func checkInternalVarName(name string) {
// Env vars must:
// - start with COCKROACH_
// - pass basic validity checks in checkVarName
if !strings.HasPrefix(name, "COCKROACH_") {
panic("invalid env var name " + name)
}
checkVarName(name)
}
func checkExternalVarName(name string) {
// Env vars must:
// - not start with COCKROACH_
// - pass basic validity checks in checkVarName
if strings.HasPrefix(name, "COCKROACH_") {
panic("invalid env var name " + name)
}
checkVarName(name)
}
// getEnv performs all of the same actions as getAndCacheEnv but also includes
// a validity check of the variable name.
func getEnv(varName string, depth int) (string, bool) {
checkInternalVarName(varName)
return getAndCacheEnv(varName, depth+1)
}
// getExternalEnv performs all of the same actions as getEnv but also asserts
// that the variable is not an internal environment variable, i.e. not
// prefixed with "COCKROACH_".
func getExternalEnv(varName string, depth int) (string, bool) {
checkExternalVarName(varName)
return getAndCacheEnv(varName, depth+1)
}
// getAndCacheEnv retrieves an environment variable, keeps track of where
// it was accessed, and checks that each environment variable is accessed
// from at most one place.
// The bookkeeping enables a report of all influential environment
// variables with "cockroach debug env". To keep this report useful,
// all relevant environment variables should be read during start up.
// This function should not be used directly; getEnv or getExternalEnv should
// be used instead.
func getAndCacheEnv(varName string, depth int) (string, bool) {
_, consumer, _, _ := runtime.Caller(depth + 1)
envVarRegistry.mu.Lock()
defer envVarRegistry.mu.Unlock()
if f, ok := envVarRegistry.cache[varName]; ok {
if f.consumer != consumer {
panic("environment variable " + varName + " already used from " + f.consumer)
}
return f.value, f.present
}
v, found := os.LookupEnv(varName)
envVarRegistry.cache[varName] = envVarInfo{consumer: consumer, present: found, value: v}
return v, found
}
// ClearEnvCache clears saved environment values so that
// a subsequent read accesses the environment again. (Used for testing.)
func ClearEnvCache() {
envVarRegistry.mu.Lock()
defer envVarRegistry.mu.Unlock()
envVarRegistry.cache = make(map[string]envVarInfo)
}
// GetEnvReport dumps all configuration variables that may have been
// used and their value.
func GetEnvReport() string {
envVarRegistry.mu.Lock()
defer envVarRegistry.mu.Unlock()
var b bytes.Buffer
for k, v := range envVarRegistry.cache {
if v.present {
fmt.Fprintf(&b, "%s = %s # %s\n", k, v.value, v.consumer)
} else {
fmt.Fprintf(&b, "# %s is not set (read from %s)\n", k, v.consumer)
}
}
return b.String()
}
// GetEnvVarsUsed returns the names of all environment variables that
// may have been used.
func GetEnvVarsUsed() (result []redact.RedactableString) {
allVarsRaw := os.Environ()
sort.Strings(allVarsRaw)
allVarsValues := make(map[redact.SafeString]string, len(allVarsRaw))
allVarNames := make([]redact.SafeString, 0, len(allVarsRaw))
for _, v := range allVarsRaw {
i := strings.IndexByte(v, '=')
if i < 0 {
continue
}
varName := redact.SafeString(v[:i])
allVarNames = append(allVarNames, varName)
var value string
if i+1 < len(v) {
value = v[i+1:]
}
allVarsValues[varName] = value
}
envVarRegistry.mu.Lock()
defer envVarRegistry.mu.Unlock()
for _, varName := range allVarNames {
_, crdbVar := envVarRegistry.cache[string(varName)]
_, safeVar := safeVarRegistry[varName]
if crdbVar || safeVar {
result = append(result, redact.Sprintf("%s=%s", varName, redact.SafeString(allVarsValues[varName])))
} else if _, reportable := valueReportableUnsafeVarRegistry[varName]; reportable {
result = append(result, redact.Sprintf("%s=%s", varName, allVarsValues[varName]))
} else if _, reportable := nameReportableUnsafeVarRegistry[varName]; reportable {
result = append(result, redact.Sprintf("%s=...", varName))
}
// For any other env var, even the name alone could reveal sensitive
// details, so we don't report it in any form.
}
return result
}
// safeVarRegistry is the list of variables where we can both report
// the name and the value safely: the value is known to never contain
// sensitive information.
var safeVarRegistry = map[redact.SafeString]struct{}{
// Go runtime.
"GOGC": {},
"GODEBUG": {},
"GOMAXPROCS": {},
"GOTRACEBACK": {},
"GOMEMLIMIT": {},
// Jemalloc configuration override.
"MALLOC_CONF": {},
// gRPC.
"GRPC_GO_LOG_SEVERITY_LEVEL": {},
"GRPC_GO_LOG_VERBOSITY_LEVEL": {},
// general
"LANG": {},
"TERM": {},
}
// valueReportableUnsafeVarRegistry is the list of variables where we can
// report the name safely, and the value as a redactable payload.
// The value may contain sensitive information, but not so sensitive
// that users would be unhappy to see them enclosed within redaction
// markers in log files.
var valueReportableUnsafeVarRegistry = map[redact.SafeString]struct{}{
"DEBUG_HTTP2_GOROUTINES": {},
"HOST_IP": {},
"LANG": {},
"LC_ALL": {},
"LC_COLLATE": {},
"LC_CTYPE": {},
"LC_TIME": {},
"LC_NUMERIC": {},
"LC_MESSAGES": {},
"LS_METRICS_ENABLED": {},
"TERM": {},
"TZ": {},
"ZONEINFO": {},
// From the Go runtime.
"LOCALDOMAIN": {},
"RES_OPTIONS": {},
"HOSTALIASES": {},
"HTTP_PROXY": {},
"HTTPS_PROXY": {},
"NO_PROXY": {},
"REQUEST_METHOD": {},
}
// nameReportableUnsafeVarRegistry is the list of variables where we can
// report the name safely, but not the value in any form because it is
// too likely to contain an unsafe payload that users would be horrified
// to see in a log file, redaction markers or not.
var nameReportableUnsafeVarRegistry = map[redact.SafeString]struct{}{
// GCP.
"GOOGLE_API_USE_MTLS": {},
"GOOGLE_CLOUD_PROJECT": {},
// AWS.
"AWS_ACCESS_KEY": {},
"AWS_ACCESS_KEY_ID": {},
"AWS_PROFILE": {},
"AWS_REGION": {},
"AWS_SDK_LOAD_CONFIG": {},
"AWS_SECRET_ACCESS_KEY": {},
"AWS_SECRET_KEY": {},
"AWS_SESSION_TOKEN": {},
"AWS_SHARED_CREDENTIALS_FILE": {},
// Azure.
"AZURE_ACCESS_TOKEN_FILE": {},
"AZURE_AUTH_LOCATION": {},
"AZURE_CONFIG_DIR": {},
"AZURE_GO_SDK_LOG_FILE": {},
"AZURE_GO_SDK_LOG_LEVEL": {},
// Google auth.
"GAE_APPLICATION": {},
"GAE_DEPLOYMENT_ID": {},
"GAE_ENV": {},
"GAE_INSTANCE": {},
"GAE_LONG_APP_ID": {},
"GAE_MINOR_VERSION": {},
"GAE_MODULE_INSTANCE": {},
"GAE_MODULE_NAME": {},
"GAE_PARTITION": {},
"GAE_SERVICE": {},
// Kerberos.
"KRB5CCNAME": {},
// Pprof.
"PPROF_BINARY_PATH": {},
"PPROF_TMPDIR": {},
"PPROF_TOOLS": {},
// Sentry-go.
"SENTRY_RELEASE": {},
}
// GetShellCommand returns a complete command to run with a prefix of the command line.
func GetShellCommand(cmd string) []string {
if runtime.GOOS == "windows" {
if shell := os.Getenv("COMSPEC"); len(shell) > 0 {
return []string{shell, "/C", cmd}
}
return []string{`C:\Windows\system32\cmd.exe`, "/C", cmd}
}
if shell := os.Getenv("SHELL"); len(shell) > 0 {
return []string{shell, "-c", cmd}
}
return []string{"/bin/sh", "-c", cmd}
}
// HomeDir returns the user's home directory, as determined by the env
// var HOME, if it exists, and otherwise the system's idea of the user
// configuration (e.g. on non-UNIX systems).
func HomeDir() (string, error) {
if homeDir := os.Getenv("HOME"); len(homeDir) > 0 {
return homeDir, nil
}
userAcct, err := user.Current()
if err != nil {
return "", err
}
return userAcct.HomeDir, nil
}
// EnvString returns the value set by the specified environment variable. The
// depth argument indicates the stack depth of the caller that should be
// associated with the variable.
// The returned boolean flag indicates if the variable is set.
func EnvString(name string, depth int) (string, bool) {
return getEnv(name, depth+1)
}
// ExternalEnvString returns the value set by the specified environment
// variable. Only non-CRDB environment variables should be accessed via this
// method. CRDB specific variables should be accessed via EnvString. The depth
// argument indicates the stack depth of the caller that should be associated
// with the variable. The returned boolean flag indicates if the variable is
// set.
func ExternalEnvString(name string, depth int) (string, bool) {
return getExternalEnv(name, depth+1)
}
// EnvOrDefaultString returns the value set by the specified
// environment variable, if any, otherwise the specified default
// value.
func EnvOrDefaultString(name string, value string) string {
if v, present := getEnv(name, 1); present {
return v
}
return value
}
// EnvOrDefaultBool returns the value set by the specified environment
// variable, if any, otherwise the specified default value.
//
// N.B. EnvOrDefaultBool has the desired side-effect of populating envVarRegistry.cache.
// It has to be invoked during (var) init; otherwise, cli/start.go:reportConfiguration will not report the
// value of this environment variable in the server log, upon startup.
//
// Correct Usage: var allowUpgradeToDev = envutil.EnvOrDefaultBool("COCKROACH_UPGRADE_TO_DEV_VERSION", false)
//
// Incorrect Usage: func() {
// ...
// var allowUpgradeToDev = envutil.EnvOrDefaultBool("COCKROACH_UPGRADE_TO_DEV_VERSION", false)
// }
//
// N.B. The same rule applies to the remaining EnvOrDefaultXXX defined here.
func EnvOrDefaultBool(name string, value bool) bool {
if str, present := getEnv(name, 1); present {
v, err := strconv.ParseBool(str)
if err != nil {
panic(fmt.Sprintf("error parsing %s: %s", name, err))
}
return v
}
return value
}
// EnvOrDefaultInt returns the value set by the specified environment
// variable, if any, otherwise the specified default value.
func EnvOrDefaultInt(name string, value int) int {
if str, present := getEnv(name, 1); present {
v, err := strconv.ParseInt(str, 0, 0)
if err != nil {
panic(fmt.Sprintf("error parsing %s: %s", name, err))
}
return int(v)
}
return value
}
// EnvOrDefaultInt64 returns the value set by the specified environment
// variable, if any, otherwise the specified default value.
func EnvOrDefaultInt64(name string, value int64) int64 {
if str, present := getEnv(name, 1); present {
v, err := strconv.ParseInt(str, 0, 64)
if err != nil {
panic(fmt.Sprintf("error parsing %s: %s", name, err))
}
return v
}
return value
}
// EnvOrDefaultFloat64 returns the value set by the specified environment
// variable, if any, otherwise the specified default value.
func EnvOrDefaultFloat64(name string, value float64) float64 {
if str, present := getEnv(name, 1); present {
v, err := strconv.ParseFloat(str, 64)
if err != nil {
panic(fmt.Sprintf("error parsing %s: %s", name, err))
}
return v
}
return value
}
var _ = EnvOrDefaultFloat64 // silence unused warning
// EnvOrDefaultBytes returns the value set by the specified environment
// variable, if any, otherwise the specified default value.
func EnvOrDefaultBytes(name string, value int64) int64 {
if str, present := getEnv(name, 1); present {
v, err := humanizeutil.ParseBytes(str)
if err != nil {
panic(fmt.Sprintf("error parsing %s: %s", name, err))
}
return v
}
return value
}
// EnvOrDefaultDuration returns the value set by the specified environment
// variable, if any, otherwise the specified default value.
func EnvOrDefaultDuration(name string, value time.Duration) time.Duration {
if str, present := getEnv(name, 1); present {
v, err := time.ParseDuration(str)
if err != nil {
panic(fmt.Sprintf("error parsing %s: %s", name, err))
}
return v
}
return value
}
// TB is a slimmed down version of testing.T for use below.
// We would like to use testutils.TB but this is not possible
// due to a dependency cycle.
type TB interface {
Fatal(args ...interface{})
Helper()
}
// TestSetEnv sets an environment variable and the cleanup function
// resets it to the original value.
func TestSetEnv(t TB, name string, value string) func() {
t.Helper()
ClearEnvCache()
before, exists := os.LookupEnv(name)
if err := os.Setenv(name, value); err != nil {
t.Fatal(err)
}
return func() {
if exists {
if err := os.Setenv(name, before); err != nil {
t.Fatal(err)
}
} else {
if err := os.Unsetenv(name); err != nil {
t.Fatal(err)
}
}
ClearEnvCache()
}
}
// TestUnsetEnv unsets an environment variable and the cleanup function
// resets it to the original value.
func TestUnsetEnv(t TB, name string) func() {
t.Helper()
ClearEnvCache()
before, exists := os.LookupEnv(name)
if !exists {
return func() {}
}
if err := os.Unsetenv(name); err != nil {
t.Fatal(err)
}
return func() {
if err := os.Setenv(name, before); err != nil {
t.Fatal(err)
}
ClearEnvCache()
}
}
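// Illustrative usage sketch: overriding an environment variable in a test via
// TestSetEnv and reading it back through the typed accessors above. The
// variable name is a placeholder.
package envutil_test

import (
	"testing"

	"github.com/cockroachdb/cockroach/pkg/util/envutil"
)

func TestEnvOverride(t *testing.T) {
	// TestSetEnv clears the env cache, sets the variable, and returns a
	// cleanup function that restores the previous state.
	defer envutil.TestSetEnv(t, "COCKROACH_EXAMPLE_FLAG", "true")()

	if !envutil.EnvOrDefaultBool("COCKROACH_EXAMPLE_FLAG", false) {
		t.Fatal("expected the override to be visible")
	}
}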
// Copyright 2017 The Cockroach Authors.
//
// Use of this software is governed by the CockroachDB Software License
// included in the /LICENSE file.
package util
import (
"time"
"github.com/cockroachdb/cockroach/pkg/util/syncutil"
)
// EveryN provides a way to rate limit spammy events. It tracks how recently a
// given event has occurred so that it can determine whether it's worth
// handling again.
//
// The zero value for EveryN is usable and is equivalent to Every(0), meaning
// that all calls to ShouldProcess will return true.
//
// NOTE: If you specifically care about log messages, you should use the
// version of this in the log package, as it integrates with the verbosity
// flags.
type EveryN struct {
// N is the minimum duration of time between log messages.
N time.Duration
syncutil.Mutex
lastProcessed time.Time
}
// Every is a convenience constructor for an EveryN object that allows a log
// message every n duration.
func Every(n time.Duration) EveryN {
return EveryN{N: n}
}
// ShouldProcess returns whether it's been more than N time since the last event.
func (e *EveryN) ShouldProcess(now time.Time) bool {
var shouldProcess bool
e.Lock()
if now.Sub(e.lastProcessed) >= e.N {
shouldProcess = true
e.lastProcessed = now
}
e.Unlock()
return shouldProcess
}
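// Illustrative usage sketch: rate-limiting a noisy message with EveryN. The
// util import path matches the one used elsewhere in this repository; the
// retry loop and log call are placeholders.
package util_test

import (
	"log"
	"time"

	"github.com/cockroachdb/cockroach/pkg/util"
)

func ExampleEveryN() {
	every := util.Every(10 * time.Second)
	for attempt := 0; attempt < 100; attempt++ {
		if every.ShouldProcess(time.Now()) {
			// At most one message per 10-second window gets through.
			log.Printf("still retrying (attempt %d)", attempt)
		}
	}
}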
// Copyright 2018 The Cockroach Authors.
//
// Use of this software is governed by the CockroachDB Software License
// included in the /LICENSE file.
package util
import (
"bytes"
"fmt"
"math"
"math/bits"
"sort"
)
// FastIntMap is a replacement for map[int]int which is more efficient when both
// keys and values are small. It can be passed by value (but Copy must be used
// for independent modification of copies).
type FastIntMap struct {
small [numWords]uint64
large map[int]int
}
// Empty returns true if the map is empty.
func (m FastIntMap) Empty() bool {
return m.small == [numWords]uint64{} && len(m.large) == 0
}
// Copy returns a FastIntMap that can be independently modified.
func (m FastIntMap) Copy() FastIntMap {
if m.large == nil {
return FastIntMap{small: m.small}
}
largeCopy := make(map[int]int, len(m.large))
for k, v := range m.large {
largeCopy[k] = v
}
return FastIntMap{large: largeCopy}
}
// Set maps a key to the given value.
func (m *FastIntMap) Set(key, val int) {
if m.large == nil {
if key >= 0 && key < numVals && val >= 0 && val <= maxValue {
m.setSmallVal(uint32(key), int32(val))
return
}
m.large = m.toLarge()
m.small = [numWords]uint64{}
}
m.large[key] = val
}
// Unset unmaps the given key.
func (m *FastIntMap) Unset(key int) {
if m.large == nil {
if key < 0 || key >= numVals {
return
}
m.setSmallVal(uint32(key), -1)
return
}
delete(m.large, key)
}
// Get returns the current value mapped to key, or (-1, false) if the
// key is unmapped.
func (m FastIntMap) Get(key int) (value int, ok bool) {
if m.large == nil {
if key < 0 || key >= numVals {
return -1, false
}
val := m.getSmallVal(uint32(key))
return int(val), (val != -1)
}
if value, ok = m.large[key]; ok {
return value, true
}
return -1, false
}
// GetDefault returns the current value mapped to key, or 0 if the key is
// unmapped.
func (m FastIntMap) GetDefault(key int) (value int) {
value, ok := m.Get(key)
if !ok {
return 0
}
return value
}
// Len returns the number of keys in the map.
func (m FastIntMap) Len() int {
if m.large != nil {
return len(m.large)
}
res := 0
for w := 0; w < numWords; w++ {
v := m.small[w]
// We want to count the number of non-zero groups. To do this, we OR all
// the bits of each group into the low-bit of that group, apply a mask
// selecting just those low bits and count the number of 1s.
// To OR the bits efficiently, we first OR the high half of each group into
// the low half of each group, and repeat.
// Note: this code assumes that numBits is a power of two.
for i := uint32(numBits / 2); i > 0; i /= 2 {
v |= (v >> i)
}
res += bits.OnesCount64(v & groupLowBitMask)
}
return res
}
// MaxKey returns the maximum key that is in the map. If the map
// is empty, returns ok=false.
func (m FastIntMap) MaxKey() (_ int, ok bool) {
if m.large == nil {
for w := numWords - 1; w >= 0; w-- {
if val := m.small[w]; val != 0 {
// Example (with numBits = 4)
// pos: 3 2 1 0
// bits: 0000 0000 0010 0000
// To get the left-most non-zero group, we calculate how many groups are
// covered by the leading zeros.
pos := numValsPerWord - 1 - bits.LeadingZeros64(val)/numBits
return w*numValsPerWord + pos, true
}
}
return 0, false
}
if len(m.large) == 0 {
return 0, false
}
max := math.MinInt
for k := range m.large {
if max < k {
max = k
}
}
return max, true
}
// MaxValue returns the maximum value that is in the map. If the map
// is empty, returns (0, false).
func (m FastIntMap) MaxValue() (_ int, ok bool) {
if m.large == nil {
// In the small case, all values are non-negative.
max := -1
for w := 0; w < numWords; w++ {
if m.small[w] != 0 {
// To optimize for small maps, we stop when the rest of the values are
// unset. See the comment in MaxKey.
numVals := numValsPerWord - bits.LeadingZeros64(m.small[w])/numBits
for i := 0; i < numVals; i++ {
val := int(m.getSmallVal(uint32(w*numValsPerWord + i)))
// NB: val is -1 here if this key isn't in the map.
if max < val {
max = val
}
}
}
}
if max == -1 {
return 0, false
}
return max, true
}
if len(m.large) == 0 {
return 0, false
}
max := math.MinInt
for _, v := range m.large {
if max < v {
max = v
}
}
return max, true
}
// ForEach calls the given function for each key/value pair in the map (in
// arbitrary order).
func (m FastIntMap) ForEach(fn func(key, val int)) {
if m.large == nil {
for i := 0; i < numVals; i++ {
if val := m.getSmallVal(uint32(i)); val != -1 {
fn(i, int(val))
}
}
} else {
for k, v := range m.large {
fn(k, v)
}
}
}
// ContentsIntoBuffer writes the contents of the map into the provided buffer in
// the following format:
//
// key1:val1 key2:val2 ...
//
// The keys are in ascending order.
func (m FastIntMap) ContentsIntoBuffer(buf *bytes.Buffer) {
first := true
if m.large != nil {
keys := make([]int, 0, len(m.large))
for k := range m.large {
keys = append(keys, k)
}
sort.Ints(keys)
for _, k := range keys {
if !first {
buf.WriteByte(' ')
}
first = false
fmt.Fprintf(buf, "%d:%d", k, m.large[k])
}
} else {
for i := 0; i < numVals; i++ {
if val := m.getSmallVal(uint32(i)); val != -1 {
if !first {
buf.WriteByte(' ')
}
first = false
fmt.Fprintf(buf, "%d:%d", i, val)
}
}
}
}
// String prints out the contents of the map in the following format:
//
// map[key1:val1 key2:val2 ...]
//
// The keys are in ascending order.
func (m FastIntMap) String() string {
var buf bytes.Buffer
buf.WriteString("map[")
m.ContentsIntoBuffer(&buf)
buf.WriteByte(']')
return buf.String()
}
// These constants determine the "small" representation: we pack <numVals>
// values of <numBits> bits into <numWords> 64-bit words. Each value is 0 if the
// corresponding key is not set, otherwise it is the value+1.
//
// It's desirable for efficiency that numBits, numValsPerWord are powers of two.
//
// The current settings support a map from keys in [0, 31] to values in [0, 14].
// Note that one value is reserved to indicate an unmapped element.
const (
numWords = 2
numBits = 4
numValsPerWord = 64 / numBits // 16
numVals = numWords * numValsPerWord // 32
mask = (1 << numBits) - 1
maxValue = mask - 1
// Mask for the low bits of each group: 0001 0001 0001 ...
groupLowBitMask = 0x1111111111111111
)
// Returns -1 if the value is unmapped.
func (m FastIntMap) getSmallVal(idx uint32) int32 {
word := idx / numValsPerWord
pos := (idx % numValsPerWord) * numBits
return int32((m.small[word]>>pos)&mask) - 1
}
func (m *FastIntMap) setSmallVal(idx uint32, val int32) {
word := idx / numValsPerWord
pos := (idx % numValsPerWord) * numBits
// Clear out any previous value
m.small[word] &= ^(mask << pos)
m.small[word] |= uint64(val+1) << pos
}
func (m *FastIntMap) toLarge() map[int]int {
res := make(map[int]int, numVals)
for i := 0; i < numVals; i++ {
val := m.getSmallVal(uint32(i))
if val != -1 {
res[i] = int(val)
}
}
return res
}
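// Illustrative usage sketch: FastIntMap behaves like a map[int]int. Keys in
// [0, 31] with values in [0, 14] stay in the inline representation; anything
// outside that range transparently spills to a real map.
package util_test

import (
	"fmt"

	"github.com/cockroachdb/cockroach/pkg/util"
)

func ExampleFastIntMap() {
	var m util.FastIntMap
	m.Set(3, 7)    // stored inline in the small representation
	m.Set(100, 42) // key out of range: spills to the map-backed representation
	if v, ok := m.Get(3); ok {
		fmt.Println(v) // 7
	}
	m.Unset(3)
	fmt.Println(m.Len(), m.String()) // 1 map[100:42]
}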
// Copyright 2022 The Cockroach Authors.
//
// Use of this software is governed by the CockroachDB Software License
// included in the /LICENSE file.
// See grunning.Supported() for an explanation behind this build tag.
//
//go:build !bazel
package grunning
func grunningnanos() int64 { return 0 }
func supported() bool { return false }
// Copyright 2022 The Cockroach Authors.
//
// Use of this software is governed by the CockroachDB Software License
// included in the /LICENSE file.
// Package grunning is a library that's able to retrieve on-CPU running time for
// individual goroutines. It relies on using a patched Go and provides a
// primitive for fine-grained CPU attribution and control. See #82356 for more
// details.
package grunning
import "time"
// Time returns the time spent by the current goroutine in the running state.
func Time() time.Duration {
return time.Duration(grunningnanos())
}
// Difference is a helper function to compute the absolute difference between
// two durations.
func Difference(a, b time.Duration) time.Duration {
diff := a.Nanoseconds() - b.Nanoseconds()
if diff < 0 {
diff = -diff
}
return time.Duration(diff)
}
// Elapsed returns the running time spent doing some piece of work, with
// grunning.Time() measurements from the start and end.
//
// NB: This only exists due to grunning.Time()'s non-monotonicity, a bug in our
// runtime patch: https://github.com/cockroachdb/cockroach/issues/95529. We can
// get rid of this, keeping just grunning.Difference(), if that bug is fixed.
// The bug results in slight {over,under}-estimation of the running time (the
// latter breaking monotonicity), but is livable with our current uses of this
// library.
func Elapsed(start, end time.Duration) time.Duration {
diff := end.Nanoseconds() - start.Nanoseconds()
if diff < 0 {
diff = 0
}
return time.Duration(diff)
}
// Supported returns true iff per-goroutine running time is available in this
// build. We use a patched Go runtime for all platforms officially supported for
// CRDB when built using Bazel.
func Supported() bool {
return supported()
}
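// Illustrative usage sketch: attributing on-CPU time to a piece of work using
// the helpers above. The import path is assumed to be
// "github.com/cockroachdb/cockroach/pkg/util/grunning"; doWork is a
// placeholder. On builds without the patched runtime (see the stub above),
// the readings are simply zero.
package grunning_test

import (
	"fmt"

	"github.com/cockroachdb/cockroach/pkg/util/grunning"
)

func ExampleElapsed() {
	if !grunning.Supported() {
		fmt.Println("per-goroutine running time not available in this build")
		return
	}
	start := grunning.Time()
	doWork()
	fmt.Println("on-CPU time:", grunning.Elapsed(start, grunning.Time()))
}

func doWork() {}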
// Copyright 2017 The Cockroach Authors.
//
// Use of this software is governed by the CockroachDB Software License
// included in the /LICENSE file.
package util
import (
"hash/crc32"
"github.com/cockroachdb/errors"
)
// CRC32 computes the Castagnoli CRC32 of the given data.
func CRC32(data []byte) uint32 {
hash := crc32.New(crc32.MakeTable(crc32.Castagnoli))
if _, err := hash.Write(data); err != nil {
panic(errors.Wrap(err, `"It never returns an error." -- https://golang.org/pkg/hash`))
}
return hash.Sum32()
}
// fnvBase is the FNV-64 offset basis and fnvPrime the FNV-64 prime, as
// defined by the FNV-1 hash specification.
const fnvBase = uint64(14695981039346656037)
const fnvPrime = 1099511628211
// FNV64 encapsulates the hash state.
type FNV64 struct {
sum uint64
}
// MakeFNV64 initializes a new FNV64 hash state.
func MakeFNV64() FNV64 {
return FNV64{sum: fnvBase}
}
// Init initializes FNV64 to starting value.
func (f *FNV64) Init() {
f.sum = fnvBase
}
// IsInitialized returns true if the hash struct was initialized, which happens
// automatically when created through MakeFNV64 above.
func (f *FNV64) IsInitialized() bool {
return f.sum != 0
}
// Add modifies the underlying FNV64 state by accumulating the given integer
// hash to the existing state.
func (f *FNV64) Add(c uint64) {
f.sum *= fnvPrime
f.sum ^= c
}
// Sum returns the hash value accumulated till now.
func (f *FNV64) Sum() uint64 {
return f.sum
}
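// Illustrative usage sketch: accumulating an FNV-64 hash over a sequence of
// integers with the helpers above.
package util_test

import (
	"fmt"

	"github.com/cockroachdb/cockroach/pkg/util"
)

func ExampleFNV64() {
	h := util.MakeFNV64()
	for _, v := range []uint64{1, 2, 3} {
		h.Add(v)
	}
	fmt.Printf("%x\n", h.Sum())
}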
// Copyright 2020 The Cockroach Authors.
//
// Use of this software is governed by the CockroachDB Software License
// included in the /LICENSE file.
package humanizeutil
import (
"math"
"github.com/cockroachdb/redact"
"github.com/dustin/go-humanize"
)
// Count formats a unitless integer value like a row count. It uses separating
// commas for large values (e.g. "1,000,000").
func Count(val uint64) redact.SafeString {
if val > math.MaxInt64 {
val = math.MaxInt64
}
return redact.SafeString(humanize.Comma(int64(val)))
}
// Countf is like Count but accepts a float64 value.
func Countf(val float64) redact.SafeString {
if val > math.MaxInt64 {
val = math.MaxInt64
}
return redact.SafeString(humanize.Commaf(val))
}
// Copyright 2020 The Cockroach Authors.
//
// Use of this software is governed by the CockroachDB Software License
// included in the /LICENSE file.
package humanizeutil
import (
"fmt"
"time"
"github.com/cockroachdb/redact"
)
// Duration formats a duration in a user-friendly way. The result is not exact
// and the granularity is no smaller than microseconds.
//
// Examples:
//
// 0 -> "0µs"
// 123456ns -> "123µs"
// 12345678ns -> "12ms"
// 12345678912ns -> "12.3s"
func Duration(val time.Duration) redact.SafeString {
val = val.Round(time.Microsecond)
if val == 0 {
return "0µs"
}
// Everything under 1ms will show up as µs.
if val < time.Millisecond {
return redact.SafeString(val.String())
}
// Everything in-between 1ms and 1s will show up as ms.
if val < time.Second {
return redact.SafeString(val.Round(time.Millisecond).String())
}
// Everything in-between 1s and 1m will show up as seconds with one decimal.
if val < time.Minute {
return redact.SafeString(val.Round(100 * time.Millisecond).String())
}
// Everything larger is rounded to the nearest second.
return redact.SafeString(val.Round(time.Second).String())
}
// LongDuration formats a duration that is expected to be on the order of
// minutes / hours / days in a user-friendly way. The result is not exact and
// the granularity is no smaller than seconds.
//
// Examples:
// - 0 seconds
// - 1 second
// - 3 minutes
// - 1 hour
// - 5 days
// - 1000 days
func LongDuration(val time.Duration) redact.SafeString {
var round time.Duration
var unit string
switch {
case val < time.Minute:
round = time.Second
unit = "second"
case val < time.Hour:
round = time.Minute
unit = "minute"
case val < 24*time.Hour:
round = time.Hour
unit = "hour"
default:
round = 24 * time.Hour
unit = "day"
}
n := int64(val.Round(round) / round)
s := ""
if n != 1 {
s = "s"
}
return redact.SafeString(fmt.Sprintf("%d %s%s", n, unit, s))
}
// Copyright 2016 The Cockroach Authors.
//
// Use of this software is governed by the CockroachDB Software License
// included in the /LICENSE file.
package humanizeutil
import (
"flag"
"fmt"
"math"
"sync/atomic"
"time"
"github.com/cockroachdb/errors"
"github.com/cockroachdb/redact"
"github.com/dustin/go-humanize"
"github.com/spf13/pflag"
)
// IBytes is an int64 version of go-humanize's IBytes.
func IBytes(value int64) redact.SafeString {
if value < 0 {
return redact.SafeString("-" + humanize.IBytes(uint64(-value)))
}
return redact.SafeString(humanize.IBytes(uint64(value)))
}
// ParseBytes is an int64 version of go-humanize's ParseBytes.
func ParseBytes(s string) (int64, error) {
if len(s) == 0 {
return 0, errors.New("parsing \"\": invalid syntax")
}
var startIndex int
var negative bool
if s[0] == '-' {
negative = true
startIndex = 1
}
value, err := humanize.ParseBytes(s[startIndex:])
if err != nil {
return 0, err
}
if value > math.MaxInt64 {
return 0, errors.Errorf("too large: %s", s)
}
if negative {
return -int64(value), nil
}
return int64(value), nil
}
// BytesValue is a struct that implements flag.Value and pflag.Value
// suitable to create command-line parameters that accept sizes
// specified using a format recognized by humanize.
// The value is written atomically, so that it is safe to use this
// struct to make a parameter configurable that is used by an
// asynchronous process spawned before command-line argument handling.
// This is useful e.g. for the log file settings which are used
// by the asynchronous log file GC daemon.
type BytesValue struct {
val *int64
isSet bool
}
var _ flag.Value = &BytesValue{}
var _ pflag.Value = &BytesValue{}
// NewBytesValue creates a new pflag.Value bound to the specified
// int64 variable. It also happens to be a flag.Value.
func NewBytesValue(val *int64) *BytesValue {
return &BytesValue{val: val}
}
// Set implements the flag.Value and pflag.Value interfaces.
func (b *BytesValue) Set(s string) error {
v, err := ParseBytes(s)
if err != nil {
return err
}
if b.val == nil {
b.val = new(int64)
}
atomic.StoreInt64(b.val, v)
b.isSet = true
return nil
}
// Type implements the pflag.Value interface.
func (b *BytesValue) Type() string {
return "bytes"
}
// String implements the flag.Value and pflag.Value interfaces.
func (b *BytesValue) String() string {
return redact.StringWithoutMarkers(b)
}
// SafeFormat implements the redact.SafeFormatter interface.
func (b *BytesValue) SafeFormat(w redact.SafePrinter, _ rune) {
// When b.val is nil, the real value of the flag will only be known after a
// Resolve() call. We do not want our flag package to report an erroneous
// default value for this flag. So the value we return here must cause
// defaultIsZeroValue to return true:
// https://github.com/spf13/pflag/blob/v1.0.5/flag.go#L724
if b.val == nil {
w.SafeString("<nil>")
return
}
// This uses the MiB, GiB, etc suffixes. If we use humanize.Bytes() we get
// the MB, GB, etc suffixes, but the conversion is done in multiples of 1000
// vs 1024.
w.Print(IBytes(atomic.LoadInt64(b.val)))
}
// IsSet returns true iff Set has successfully been called.
func (b *BytesValue) IsSet() bool {
return b.isSet
}
// DataRate formats the passed byte count over duration as "x MiB/s".
func DataRate(bytes int64, elapsed time.Duration) redact.SafeString {
if bytes == 0 {
return "0"
}
if elapsed == 0 {
return "inf"
}
return redact.SafeString(fmt.Sprintf("%0.2f MiB/s",
(float64(bytes)/elapsed.Seconds())/float64(1<<20)))
}
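// Illustrative usage sketch: binding a BytesValue to a pflag.FlagSet so that a
// size flag accepts humanized input such as "64MiB". The flag name is a
// placeholder.
package humanizeutil_test

import (
	"fmt"

	"github.com/cockroachdb/cockroach/pkg/util/humanizeutil"
	"github.com/spf13/pflag"
)

func ExampleNewBytesValue() {
	var cacheSize int64
	fs := pflag.NewFlagSet("example", pflag.ContinueOnError)
	fs.Var(humanizeutil.NewBytesValue(&cacheSize), "cache-size", "cache size in bytes")
	if err := fs.Parse([]string{"--cache-size=64MiB"}); err != nil {
		panic(err)
	}
	fmt.Println(cacheSize, humanizeutil.IBytes(cacheSize)) // 67108864 64 MiB
}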
// Copyright 2014 The Cockroach Authors.
//
// Use of this software is governed by the CockroachDB Software License
// included in the /LICENSE file.
package addr
import (
"fmt"
"net"
"strconv"
"strings"
"github.com/cockroachdb/errors"
"github.com/spf13/pflag"
)
// AddrWithDefaultLocalhost returns addr with the host set
// to localhost if it is empty.
func AddrWithDefaultLocalhost(addr string) (string, error) {
host, port, err := net.SplitHostPort(addr)
if err != nil {
return "", err
}
if host == "" {
host = "localhost"
}
return net.JoinHostPort(host, port), nil
}
// SplitHostPort is like net.SplitHostPort however it supports
// addresses without a port number. In that case, the provided port
// number is used.
func SplitHostPort(v string, defaultPort string) (addr string, port string, err error) {
addr, port, err = net.SplitHostPort(v)
if err != nil {
var aerr *net.AddrError
if errors.As(err, &aerr) {
if strings.HasPrefix(aerr.Err, "too many colons") {
// Maybe this was an IPv6 address using the deprecated syntax
// without '[...]'? Try that to help the user with a hint.
// Note: the following is valid even if defaultPort is empty.
// (An empty port number is always a valid listen address.)
maybeAddr := "[" + v + "]:" + defaultPort
addr, port, err = net.SplitHostPort(maybeAddr)
if err == nil {
err = errors.WithHintf(
errors.Newf("invalid address format: %q", v),
"enclose IPv6 addresses within [...], e.g. \"[%s]\"", v)
}
} else if strings.HasPrefix(aerr.Err, "missing port") {
// It's inconvenient that SplitHostPort doesn't know how to ignore
// a missing port number. Oh well.
addr, port, err = net.SplitHostPort(v + ":" + defaultPort)
}
}
}
if err == nil && port == "" {
port = defaultPort
}
return addr, port, err
}
type addrSetter struct {
addr *string
port *string
}
// NewAddrSetter creates a new pflag.Value that wraps an address/port
// configuration option pair and enables setting them both with a
// single command-line flag.
func NewAddrSetter(hostOption, portOption *string) pflag.Value {
return &addrSetter{addr: hostOption, port: portOption}
}
// String implements the pflag.Value interface.
func (a addrSetter) String() string {
return net.JoinHostPort(*a.addr, *a.port)
}
// Type implements the pflag.Value interface.
func (a addrSetter) Type() string { return "<addr/host>[:<port>]" }
// Set implements the pflag.Value interface.
func (a addrSetter) Set(v string) error {
addr, port, err := SplitHostPort(v, *a.port)
if err != nil {
return err
}
*a.addr = addr
*a.port = port
return nil
}
type portRangeSetter struct {
lower *int
upper *int
}
// NewPortRangeSetter creates a new pflag.Value that allows setting a
// lower and upper bound of a port range with a single setting.
func NewPortRangeSetter(lower, upper *int) pflag.Value {
return portRangeSetter{lower: lower, upper: upper}
}
// String implements the pflag.Value interface.
func (a portRangeSetter) String() string {
return fmt.Sprintf("%d-%d", *a.lower, *a.upper)
}
// Type implements the pflag.Value interface.
func (a portRangeSetter) Type() string { return "<lower>-<upper>" }
// Set implements the pflag.Value interface.
func (a portRangeSetter) Set(v string) error {
parts := strings.Split(v, "-")
if len(parts) > 2 {
return errors.New("invalid port range: too many parts")
}
if len(parts) < 2 || parts[1] == "" {
return errors.New("invalid port range: too few parts")
}
lower, err := strconv.Atoi(parts[0])
if err != nil {
return errors.Wrap(err, "invalid port range")
}
upper, err := strconv.Atoi(parts[1])
if err != nil {
return errors.Wrap(err, "invalid port range")
}
if lower > upper {
return errors.Newf("invalid port range: lower bound (%d) > upper bound (%d)", lower, upper)
}
*a.lower = lower
*a.upper = upper
return nil
}
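// Illustrative usage sketch: binding an address flag and a port-range flag
// with the pflag helpers above. The import path for this package and the flag
// names are assumptions.
package addr_test

import (
	"fmt"

	"github.com/cockroachdb/cockroach/pkg/util/netutil/addr"
	"github.com/spf13/pflag"
)

func ExampleNewAddrSetter() {
	host, port := "localhost", "26257"
	lower, upper := 0, 0

	fs := pflag.NewFlagSet("example", pflag.ContinueOnError)
	fs.Var(addr.NewAddrSetter(&host, &port), "listen-addr", "address/host[:port] to listen on")
	fs.Var(addr.NewPortRangeSetter(&lower, &upper), "port-range", "lower-upper port range")

	if err := fs.Parse([]string{"--listen-addr=10.0.0.1", "--port-range=8000-9000"}); err != nil {
		panic(err)
	}
	// The port keeps its previous value because none was supplied with the address.
	fmt.Println(host, port, lower, upper) // 10.0.0.1 26257 8000 9000
}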
// Copyright 2016 The Cockroach Authors.
//
// Use of this software is governed by the CockroachDB Software License
// included in the /LICENSE file.
package util
// NoCopy may be embedded into structs which must not be copied
// after the first use.
//
// See https://github.com/golang/go/issues/8005#issuecomment-190753527
// for details.
type NoCopy struct{}
// Silence unused warnings.
var _ = NoCopy{}
// Lock is a no-op used by -copylocks checker from `go vet`.
func (*NoCopy) Lock() {}
// Unlock is a no-op used by -copylocks checker from `go vet`.
func (*NoCopy) Unlock() {}
// Copyright 2016 The Cockroach Authors.
//
// Use of this software is governed by the CockroachDB Software License
// included in the /LICENSE file.
package util
import "github.com/cockroachdb/redact"
// Pluralize returns a single character 's' unless n == 1.
func Pluralize(n int64) redact.SafeString {
if n == 1 {
return ""
}
return "s"
}
// Copyright 2017 The Cockroach Authors.
//
// Use of this software is governed by the CockroachDB Software License
// included in the /LICENSE file.
//go:build !race
package util
// RaceEnabled is true if CockroachDB was built with the race build tag.
const RaceEnabled = false
// EnableRacePreemptionPoints enables goroutine preemption points declared with
// RacePreempt for builds using the race build tag.
func EnableRacePreemptionPoints() func() { return func() {} }
// RacePreempt adds a goroutine preemption point if CockroachDB was built with
// the race build tag and preemption points have been enabled. The function is a
// no-op (and should be optimized out through dead code elimination) if the race
// build tag was not used.
func RacePreempt() {}
// Copyright 2014 The Cockroach Authors.
//
// Use of this software is governed by the CockroachDB Software License
// included in the /LICENSE file.
package randutil
import (
crypto_rand "crypto/rand"
"encoding/binary"
"fmt"
"log" // Don't bring cockroach/util/log into this low-level package.
"math/rand"
"runtime"
"strings"
"time"
_ "unsafe" // required by go:linkname
"github.com/cockroachdb/cockroach/pkg/util/envutil"
"github.com/cockroachdb/cockroach/pkg/util/syncutil"
)
// lockedSource is a thread safe math/rand.Source. See math/rand/rand.go.
type lockedSource struct {
mu syncutil.Mutex
src rand.Source64
}
// NewLockedSource creates a random source protected by a mutex.
func NewLockedSource(seed int64) rand.Source {
return &lockedSource{
src: rand.NewSource(seed).(rand.Source64),
}
}
func (rng *lockedSource) Int63() (n int64) {
rng.mu.Lock()
defer rng.mu.Unlock()
n = rng.src.Int63()
return
}
func (rng *lockedSource) Uint64() (n uint64) {
rng.mu.Lock()
defer rng.mu.Unlock()
n = rng.src.Uint64()
return
}
func (rng *lockedSource) Seed(seed int64) {
rng.mu.Lock()
defer rng.mu.Unlock()
rng.src.Seed(seed)
}
// globalSeed contains a pseudo random seed that should only be used in tests.
var globalSeed int64
// rng is a random number generator used to generate seeds for test random
// number generators.
var rng *rand.Rand
// lastTestName is the function name of the last test we have seen.
var lastTestName string
// mtx protects rng and lastTestName.
var mtx syncutil.Mutex
// Initializes the global random seed. This value can be specified via an
// environment variable COCKROACH_RANDOM_SEED=x.
func init() {
globalSeed = envutil.EnvOrDefaultInt64("COCKROACH_RANDOM_SEED", NewPseudoSeed())
rng = rand.New(rand.NewSource(globalSeed))
}
// NewPseudoSeed generates a seed from crypto/rand.
func NewPseudoSeed() int64 {
var seed int64
err := binary.Read(crypto_rand.Reader, binary.LittleEndian, &seed)
if err != nil {
panic(fmt.Sprintf("could not read from crypto/rand: %s", err))
}
return seed
}
// NewPseudoRand returns an instance of math/rand.Rand seeded from the
// environment variable COCKROACH_RANDOM_SEED. If that variable is not set,
// crypto/rand is used to generate a seed. The seed is also returned so we can
// easily and cheaply generate unique streams of numbers. The created object is
// not safe for concurrent access.
func NewPseudoRand() (*rand.Rand, int64) {
seed := envutil.EnvOrDefaultInt64("COCKROACH_RANDOM_SEED", NewPseudoSeed())
return rand.New(rand.NewSource(seed)), seed
}
// NewLockedPseudoRand is the same as NewPseudoRand, but the returned Rand uses
// a thread-safe underlying source.
func NewLockedPseudoRand() (*rand.Rand, int64) {
seed := envutil.EnvOrDefaultInt64("COCKROACH_RANDOM_SEED", NewPseudoSeed())
return rand.New(NewLockedSource(seed)), seed
}
// NewPseudoRandWithGlobalSeed returns an instance of math/rand.Rand, which is
// seeded with the global seed.
// It's _not_ intended to be called directly from a test; use NewTestRand for that.
// Instead, this function is useful for seeding other random number generators, on which the tests
// may depend; e.g., metamorphic constants.
// N.B. unlike NewTestRand, this function _never_ reseeds rng.
func NewPseudoRandWithGlobalSeed() (*rand.Rand, int64) {
return rand.New(rand.NewSource(globalSeed)), globalSeed
}
// NewTestRand returns an instance of math/rand.Rand seeded from rng, which is
// seeded with the global seed. If the caller is a test with a different
// path-qualified name than the previous caller, rng is reseeded from the global
// seed. This rand.Rand is useful in testing to produce deterministic,
// reproducible behavior.
func NewTestRand() (*rand.Rand, int64) {
return newTestRandImpl(rand.NewSource)
}
// NewLockedTestRand is identical to NewTestRand, but the returned rand.Rand
// uses a thread-safe underlying source.
func NewLockedTestRand() (*rand.Rand, int64) {
return newTestRandImpl(NewLockedSource)
}
func newTestRandImpl(f func(int64) rand.Source) (*rand.Rand, int64) {
mtx.Lock()
defer mtx.Unlock()
fxn := getTestName()
if fxn != "" && lastTestName != fxn {
// Re-seed rng (the source of seeds for test random number generators) with
// the global seed so that individual tests are reproducible using the
// random seed.
lastTestName = fxn
rng = rand.New(f(globalSeed))
}
seed := rng.Int63()
return rand.New(f(seed)), seed
}
// NewTestRandWithSeed returns an instance of math/rand.Rand, similar to
// NewTestRand, but with the seed specified.
func NewTestRandWithSeed(seed int64) *rand.Rand {
mtx.Lock()
defer mtx.Unlock()
fxn := getTestName()
if fxn != "" && lastTestName != fxn {
lastTestName = fxn
}
return rand.New(rand.NewSource(seed))
}
// RandIntInRange returns a value in [min, max)
func RandIntInRange(r *rand.Rand, min, max int) int {
return min + r.Intn(max-min)
}
// RandInt63InRange returns a value in [min, max)
func RandInt63InRange(r *rand.Rand, min, max int64) int64 {
return min + r.Int63n(max-min)
}
// RandUint64n generates a 64-bit random number in [0, n) range.
// Note: n == 0 means n is math.MaxUint64 + 1
func RandUint64n(r *rand.Rand, n uint64) uint64 {
if n == 0 {
return r.Uint64()
}
// If n is less than 64 bits, delegate to 63 bit version.
if n < (1 << 63) {
return uint64(r.Int63n(int64(n)))
}
v := r.Uint64()
for v >= n {
v = r.Uint64()
}
return v
}
// RandDuration returns a random duration in [0, max).
func RandDuration(r *rand.Rand, max time.Duration) time.Duration {
return time.Duration(r.Int63n(int64(max)))
}
var randLetters = []byte("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ")
// RandBytes returns a byte slice of the given length with random
// data.
func RandBytes(r *rand.Rand, size int) []byte {
if size <= 0 {
return nil
}
arr := make([]byte, size)
for i := 0; i < len(arr); i++ {
arr[i] = randLetters[r.Intn(len(randLetters))]
}
return arr
}
// FastUint32 returns a lock free uint32 value. Compared to rand.Uint32, this
// implementation scales. We're using the go runtime's implementation through a
// linker trick.
//
//go:linkname FastUint32 runtime.fastrand
func FastUint32() uint32
// FastInt63 returns a non-negative pseudo-random 63-bit integer as an int64.
// Compared to rand.Int63(), this implementation scales.
func FastInt63() int64 {
x, y := FastUint32(), FastUint32() // 32-bit halves
u := uint64(x)<<32 ^ uint64(y)
i := int64(u >> 1) // clear sign bit
return i
}
// ReadTestdataBytes reads random bytes, but then nudges them into printable
// ASCII, *reducing their randomness* to make them a little friendlier for
// humans using them as testdata.
func ReadTestdataBytes(r *rand.Rand, arr []byte) {
_, _ = r.Read(arr)
for i := range arr {
arr[i] = arr[i] & 0x7F // mask out non-ascii
if arr[i] < ' ' { // Nudge the control chars up, into the letters.
arr[i] += 'A'
}
}
}
// PrintableKeyAlphabet is an alphabet to use with random string generation to
// produce strings that don't need to be escaped when found as part of a key
// and are generally human printable.
const PrintableKeyAlphabet = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789"
// RandString generates a random string of the desired length from the
// input alphabet. It is useful when you want to generate keys that would
// be printable without further escaping if alphabet is restricted to
// alphanumeric chars.
func RandString(rng *rand.Rand, length int, alphabet string) string {
runes := []rune(alphabet)
var buf strings.Builder
buf.Grow(length)
for i := 0; i < length; i++ {
buf.WriteRune(runes[rng.Intn(len(runes))])
}
return buf.String()
}
// SeedForTests seeds the random number generator and prints the seed
// value used. This function should be called from TestMain; individual tests
// should not touch the seed of the global random number generator.
func SeedForTests() {
//lint:ignore SA1019 deprecated
rand.Seed(globalSeed)
log.Printf("random seed: %v", globalSeed)
}
// getTestName returns the calling test function name, returning an empty string
// if not found. The number of calls up the call stack is limited.
func getTestName() string {
pcs := make([]uintptr, 10)
n := runtime.Callers(2, pcs)
frames := runtime.CallersFrames(pcs[:n])
for {
frame, more := frames.Next()
fxn := frame.Function
if strings.Contains(fxn, ".Test") {
return fxn
}
if !more {
break
}
}
return ""
}
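// Illustrative usage sketch: a reproducible randomized test that derives its
// randomness from NewTestRand, so that a failure can be replayed by setting
// COCKROACH_RANDOM_SEED to the logged seed. The import path is assumed to be
// "github.com/cockroachdb/cockroach/pkg/util/randutil".
package randutil_test

import (
	"testing"

	"github.com/cockroachdb/cockroach/pkg/util/randutil"
)

func TestRandomizedInput(t *testing.T) {
	rng, seed := randutil.NewTestRand()
	t.Logf("random seed: %d", seed)

	n := randutil.RandIntInRange(rng, 1, 100)
	key := randutil.RandString(rng, n, randutil.PrintableKeyAlphabet)
	if len(key) != n {
		t.Fatalf("expected %d characters, got %d", n, len(key))
	}
}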
// Copyright 2016 The Cockroach Authors.
//
// Use of this software is governed by the CockroachDB Software License
// included in the /LICENSE file.
package util
import "reflect"
// EqualPtrFields uses reflection to check two "mirror" structures for matching pointer fields that
// point to the same object. Used to verify cloning/deep copy functions.
//
// Returns the names of equal pointer fields.
func EqualPtrFields(src, dst reflect.Value, prefix string) []string {
t := dst.Type()
if t.Kind() != reflect.Struct {
return nil
}
if srcType := src.Type(); srcType != t {
return nil
}
var res []string
for i := 0; i < t.NumField(); i++ {
srcF, dstF := src.Field(i), dst.Field(i)
switch f := t.Field(i); f.Type.Kind() {
case reflect.Ptr:
if srcF.Interface() == dstF.Interface() {
res = append(res, prefix+f.Name)
}
case reflect.Slice:
if srcF.Pointer() == dstF.Pointer() {
res = append(res, prefix+f.Name)
}
l := dstF.Len()
if srcLen := srcF.Len(); srcLen < l {
l = srcLen
}
for i := 0; i < l; i++ {
res = append(res, EqualPtrFields(srcF.Index(i), dstF.Index(i), f.Name+".")...)
}
case reflect.Struct:
res = append(res, EqualPtrFields(srcF, dstF, f.Name+".")...)
}
}
return res
}
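// Illustrative usage sketch: using EqualPtrFields to detect that a plain
// assignment shares pointer and slice fields between two struct values. The
// struct type and field names are placeholders.
package util_test

import (
	"reflect"
	"testing"

	"github.com/cockroachdb/cockroach/pkg/util"
)

type exampleConfig struct {
	Name  *string
	Peers []string
}

func TestDetectSharedFields(t *testing.T) {
	name := "n1"
	src := exampleConfig{Name: &name, Peers: []string{"a", "b"}}
	dst := src // plain assignment shares both the pointer and the slice

	shared := util.EqualPtrFields(reflect.ValueOf(src), reflect.ValueOf(dst), "")
	if len(shared) != 2 {
		t.Fatalf("expected Name and Peers to be reported as shared, got %v", shared)
	}
}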
// Copyright 2021 The Cockroach Authors.
//
// Use of this software is governed by the CockroachDB Software License
// included in the /LICENSE file.
package util
import (
"cmp"
"slices"
)
// CombineUnique merges two ordered slices. If both slices have unique elements
// then so does the resulting slice. More generally, each element is present
// max(timesInA, timesInB) times.
//
// Takes ownership of both slices, and uses the longer one to store the result.
//
// This function is used to combine slices where one of the slices is small or
// has mostly the same elements as the other. If the two slices are large and
// don't have many duplicates, this function should be avoided: the repeated
// use of `copy` to make room for inserted elements can become expensive.
func CombineUnique[T cmp.Ordered](a, b []T) []T {
// We want b to be the smaller slice, so there are fewer elements to be added.
if len(b) > len(a) {
b, a = a, b
}
aIter, bIter := 0, 0
for aIter < len(a) && bIter < len(b) {
if a[aIter] == b[bIter] {
aIter++
bIter++
} else if a[aIter] < b[bIter] {
aIter++
} else {
var zero T
a = append(a, zero)
copy(a[aIter+1:], a[aIter:])
a[aIter] = b[bIter]
aIter++
bIter++
}
}
if bIter < len(b) {
a = append(a, b[bIter:]...)
}
return a
}
// Filter returns a new slice that only contains elements from collection that
// satisfy predicate.
//
// // Keep only the even numbers, assigning back over the original slice.
// numbers = Filter(numbers, isEven)
// // Filter into a new slice.
// evens := Filter(numbers, isEven)
func Filter[T any](collection []T, predicate func(T) bool) []T {
i := 0
out := make([]T, len(collection))
for j := range collection {
if predicate(collection[j]) {
out[i] = collection[j]
i++
}
}
return slices.Clip(out[:i])
}
// Map returns a new slice containing the results of fn for each element within
// collection. Usage:
//
// Map([]int{1, 2, 3}, func(i int) int {
// return i
// })
func Map[T, K any](collection []T, fn func(T) K) []K {
out := make([]K, len(collection))
for i, el := range collection {
out[i] = fn(el)
}
return out
}
// MapFrom returns a map populated with keys and values returned by fn.
// Usage:
//
// // Construct a set.
// MapFrom(numbers, func(i int) (int, struct{}) {
// return i, struct{}{}
// })
//
// // Construct a map of numbers to their square.
// MapFrom(numbers, func(i int) (int, int) {
// return i, i * i
// })
func MapFrom[T any, K comparable, V any](collection []T, fn func(T) (K, V)) map[K]V {
out := make(map[K]V, len(collection))
for _, el := range collection {
key, value := fn(el)
out[key] = value
}
return out
}
// InsertUnique inserts an element into an ordered slice if the element is not
// already present, while maintaining the ordering property. The possibly
// updated slice is returned.
func InsertUnique[T cmp.Ordered](s []T, v T) []T {
idx, found := slices.BinarySearch(s, v)
if found {
return s
}
return slices.Insert(s, idx, v)
}
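// Illustrative usage sketch: combining the slice helpers above. The values are
// placeholders.
package util_test

import (
	"fmt"

	"github.com/cockroachdb/cockroach/pkg/util"
)

func ExampleCombineUnique() {
	a := []int{1, 3, 5}
	b := []int{2, 3, 6}
	merged := util.CombineUnique(a, b) // both inputs must already be sorted
	merged = util.InsertUnique(merged, 4)

	evens := util.Filter(merged, func(i int) bool { return i%2 == 0 })
	squares := util.Map(evens, func(i int) int { return i * i })
	fmt.Println(merged, evens, squares) // [1 2 3 4 5 6] [2 4 6] [4 16 36]
}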
// Copyright 2016 The Cockroach Authors.
//
// Use of this software is governed by the CockroachDB Software License
// included in the /LICENSE file.
package util
import (
"runtime"
"strings"
"github.com/cockroachdb/redact"
)
var prefix = func() string {
result := "github.com/cockroachdb/cockroach/pkg/"
if runtime.Compiler == "gccgo" {
result = strings.Replace(result, ".", "_", -1)
result = strings.Replace(result, "/", "_", -1)
}
return result
}()
// GetSmallTrace returns a comma-separated string containing the top
// 5 callers from a given skip level.
func GetSmallTrace(skip int) redact.RedactableString {
var pcs [5]uintptr
runtime.Callers(skip, pcs[:])
frames := runtime.CallersFrames(pcs[:])
var callers redact.StringBuilder
var callerPrefix redact.RedactableString
for {
f, more := frames.Next()
function := strings.TrimPrefix(f.Function, prefix)
file := f.File
if index := strings.LastIndexByte(file, '/'); index >= 0 {
file = file[index+1:]
}
callers.Printf("%s%s:%d:%s", callerPrefix, redact.SafeString(file), f.Line, redact.SafeString(function))
callerPrefix = ","
if !more {
break
}
}
return callers.RedactableString()
}
// Code generated by go_generics. DO NOT EDIT.
// Copyright 2020 The Cockroach Authors.
//
// Use of this software is governed by the CockroachDB Software License
// included in the /LICENSE file.
package span
import (
"bytes"
"sort"
"strings"
"sync"
"sync/atomic"
)
// nilT is a nil instance of the Template type.
var nilT *btreeFrontierEntry
const (
degree = 16
maxItems = 2*degree - 1
minItems = degree - 1
)
// compare returns a value indicating the sort order relationship between
// a and b. The comparison is performed lexicographically on
//
// (a.Key(), a.EndKey(), a.ID())
//
// and
//
// (b.Key(), b.EndKey(), b.ID())
//
// tuples.
//
// Given c = compare(a, b):
//
// c == -1 if (a.Key(), a.EndKey(), a.ID()) < (b.Key(), b.EndKey(), b.ID())
// c == 0 if (a.Key(), a.EndKey(), a.ID()) == (b.Key(), b.EndKey(), b.ID())
// c == 1 if (a.Key(), a.EndKey(), a.ID()) > (b.Key(), b.EndKey(), b.ID())
func compare(a, b *btreeFrontierEntry) int {
c := bytes.Compare(a.Key(), b.Key())
if c != 0 {
return c
}
c = bytes.Compare(a.EndKey(), b.EndKey())
if c != 0 {
return c
}
if a.ID() < b.ID() {
return -1
} else if a.ID() > b.ID() {
return 1
} else {
return 0
}
}
// keyBound represents the upper-bound of a key range.
type keyBound struct {
key []byte
inc bool
}
func (b keyBound) compare(o keyBound) int {
c := bytes.Compare(b.key, o.key)
if c != 0 {
return c
}
if b.inc == o.inc {
return 0
}
if b.inc {
return 1
}
return -1
}
func (b keyBound) contains(a *btreeFrontierEntry) bool {
c := bytes.Compare(a.Key(), b.key)
if c == 0 {
return b.inc
}
return c < 0
}
func upperBound(c *btreeFrontierEntry) keyBound {
if len(c.EndKey()) != 0 {
return keyBound{key: c.EndKey()}
}
return keyBound{key: c.Key(), inc: true}
}
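// Illustrative note (not from the original source): upperBound treats an entry
// with an empty end key as a point, yielding an inclusive bound on its start
// key, while a range entry yields an exclusive bound on its end key:
//
// Key()="b", EndKey()=""  => keyBound{key: "b", inc: true}
// Key()="b", EndKey()="d" => keyBound{key: "d", inc: false}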
type node struct {
ref int32
count int16
// These fields form a keyBound, but by inlining them into node we can avoid
// the extra word that would be needed to pad out maxInc if it were part of
// its own struct.
maxInc bool
maxKey []byte
items [maxItems]*btreeFrontierEntry
// The children array pointer is only populated for interior nodes; it is nil
// for leaf nodes.
children *childrenArray
}
type childrenArray = [maxItems + 1]*node
var leafPool = sync.Pool{
New: func() interface{} {
return new(node)
},
}
var nodePool = sync.Pool{
New: func() interface{} {
type interiorNode struct {
node
children childrenArray
}
n := new(interiorNode)
n.node.children = &n.children
return &n.node
},
}
func newLeafNode() *node {
n := leafPool.Get().(*node)
n.ref = 1
return n
}
func newNode() *node {
n := nodePool.Get().(*node)
n.ref = 1
return n
}
// mut creates and returns a mutable node reference. If the node is not shared
// with any other trees then it can be modified in place. Otherwise, it must be
// cloned to ensure unique ownership. In this way, we enforce a copy-on-write
// policy which transparently incorporates the idea of local mutations, like
// Clojure's transients or Haskell's ST monad, where nodes are only copied
// during the first time that they are modified between Clone operations.
//
// When a node is cloned, the provided pointer will be redirected to the new
// mutable node.
func mut(n **node) *node {
if atomic.LoadInt32(&(*n).ref) == 1 {
// Exclusive ownership. Can mutate in place.
return *n
}
// If we do not have unique ownership over the node then we
// clone it to gain unique ownership. After doing so, we can
// release our reference to the old node. We pass recursive
// as true because even though we just observed the node's
// reference count to be greater than 1, we might be racing
// with another call to decRef on this node.
c := (*n).clone()
(*n).decRef(true /* recursive */)
*n = c
return *n
}
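// Illustrative sketch (not from the original source): the typical call pattern
// during a tree mutation is
//
// child := mut(&n.children[i]) // clones n.children[i] only if it is shared
// child.insert(item)           // now safe to modify in place
//
// so at most the path from the root down to the modified leaf is copied.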
// leaf returns true if this is a leaf node.
func (n *node) leaf() bool {
return n.children == nil
}
// max returns the maximum keyBound in the subtree rooted at this node.
func (n *node) max() keyBound {
return keyBound{
key: n.maxKey,
inc: n.maxInc,
}
}
// setMax sets the maximum keyBound for the subtree rooted at this node.
func (n *node) setMax(k keyBound) {
n.maxKey = k.key
n.maxInc = k.inc
}
// incRef acquires a reference to the node.
func (n *node) incRef() {
atomic.AddInt32(&n.ref, 1)
}
// decRef releases a reference to the node. If requested, the method
// will recurse into child nodes and decrease their refcounts as well.
func (n *node) decRef(recursive bool) {
if atomic.AddInt32(&n.ref, -1) > 0 {
// Other references remain. Can't free.
return
}
// Clear and release node into memory pool.
if n.leaf() {
*n = node{}
leafPool.Put(n)
} else {
// Release child references first, if requested.
if recursive {
for i := int16(0); i <= n.count; i++ {
n.children[i].decRef(true /* recursive */)
}
}
*n = node{children: n.children}
*n.children = childrenArray{}
nodePool.Put(n)
}
}
// clone creates a clone of the receiver with a single reference count.
func (n *node) clone() *node {
var c *node
if n.leaf() {
c = newLeafNode()
} else {
c = newNode()
}
// NB: copy field-by-field without touching n.ref to avoid
// triggering the race detector and looking like a data race.
c.count = n.count
c.maxKey = n.maxKey
c.maxInc = n.maxInc
c.items = n.items
if !c.leaf() {
// Copy children and increase each refcount.
*c.children = *n.children
for i := int16(0); i <= c.count; i++ {
c.children[i].incRef()
}
}
return c
}
func (n *node) insertAt(index int, item *btreeFrontierEntry, nd *node) {
if index < int(n.count) {
copy(n.items[index+1:n.count+1], n.items[index:n.count])
if !n.leaf() {
copy(n.children[index+2:n.count+2], n.children[index+1:n.count+1])
}
}
n.items[index] = item
if !n.leaf() {
n.children[index+1] = nd
}
n.count++
}
func (n *node) pushBack(item *btreeFrontierEntry, nd *node) {
n.items[n.count] = item
if !n.leaf() {
n.children[n.count+1] = nd
}
n.count++
}
func (n *node) pushFront(item *btreeFrontierEntry, nd *node) {
if !n.leaf() {
copy(n.children[1:n.count+2], n.children[:n.count+1])
n.children[0] = nd
}
copy(n.items[1:n.count+1], n.items[:n.count])
n.items[0] = item
n.count++
}
// removeAt removes a value at a given index, pulling all subsequent values
// back.
func (n *node) removeAt(index int) (*btreeFrontierEntry, *node) {
var child *node
if !n.leaf() {
child = n.children[index+1]
copy(n.children[index+1:n.count], n.children[index+2:n.count+1])
n.children[n.count] = nil
}
n.count--
out := n.items[index]
copy(n.items[index:n.count], n.items[index+1:n.count+1])
n.items[n.count] = nilT
return out, child
}
// popBack removes and returns the last element in the list.
func (n *node) popBack() (*btreeFrontierEntry, *node) {
n.count--
out := n.items[n.count]
n.items[n.count] = nilT
if n.leaf() {
return out, nil
}
child := n.children[n.count+1]
n.children[n.count+1] = nil
return out, child
}
// popFront removes and returns the first element in the list.
func (n *node) popFront() (*btreeFrontierEntry, *node) {
n.count--
var child *node
if !n.leaf() {
child = n.children[0]
copy(n.children[:n.count+1], n.children[1:n.count+2])
n.children[n.count+1] = nil
}
out := n.items[0]
copy(n.items[:n.count], n.items[1:n.count+1])
n.items[n.count] = nilT
return out, child
}
// find returns the index where the given item should be inserted into this
// list. 'found' is true if the item already exists in the list at the given
// index.
func (n *node) find(item *btreeFrontierEntry) (index int, found bool) {
// Logic copied from sort.Search. Inlining this gave
// an 11% speedup on BenchmarkBTreeDeleteInsert.
i, j := 0, int(n.count)
for i < j {
h := int(uint(i+j) >> 1) // avoid overflow when computing h
// i ≤ h < j
v := compare(item, n.items[h])
if v == 0 {
return h, true
} else if v > 0 {
i = h + 1
} else {
j = h
}
}
return i, false
}
// split splits the given node at the given index. The current node shrinks,
// and this function returns the item that existed at that index and a new
// node containing all items/children after it.
//
// Before:
//
// +-----------+
// | x y z |
// +--/-/-\-\--+
//
// After:
//
// +-----------+
// | y |
// +----/-\----+
// / \
// v v
//
// +-----------+ +-----------+
// | x | | z |
// +-----------+ +-----------+
func (n *node) split(i int) (*btreeFrontierEntry, *node) {
out := n.items[i]
var next *node
if n.leaf() {
next = newLeafNode()
} else {
next = newNode()
}
next.count = n.count - int16(i+1)
copy(next.items[:], n.items[i+1:n.count])
for j := int16(i); j < n.count; j++ {
n.items[j] = nilT
}
if !n.leaf() {
copy(next.children[:], n.children[i+1:n.count+1])
for j := int16(i + 1); j <= n.count; j++ {
n.children[j] = nil
}
}
n.count = int16(i)
nextMax := next.findUpperBound()
next.setMax(nextMax)
nMax := n.max()
if nMax.compare(nextMax) != 0 && nMax.compare(upperBound(out)) != 0 {
// If upper bound wasn't from new node or item
// at index i, it must still be from old node.
} else {
n.setMax(n.findUpperBound())
}
return out, next
}
// insert inserts an item into the subtree rooted at this node, making sure no
// nodes in the subtree exceed maxItems items. Returns true if an existing item
// was replaced and false if an item was inserted. Also returns whether the
// node's upper bound changes.
func (n *node) insert(item *btreeFrontierEntry) (replaced, newBound bool) {
i, found := n.find(item)
if found {
n.items[i] = item
return true, false
}
if n.leaf() {
n.insertAt(i, item, nil)
return false, n.adjustUpperBoundOnInsertion(item, nil)
}
if n.children[i].count >= maxItems {
splitLa, splitNode := mut(&n.children[i]).split(maxItems / 2)
n.insertAt(i, splitLa, splitNode)
switch v := compare(item, n.items[i]); {
case v < 0:
// no change, we want first split node
case v > 0:
i++ // we want second split node
default:
n.items[i] = item
return true, false
}
}
replaced, newBound = mut(&n.children[i]).insert(item)
if newBound {
newBound = n.adjustUpperBoundOnInsertion(item, nil)
}
return replaced, newBound
}
// removeMax removes and returns the maximum item from the subtree rooted at
// this node.
func (n *node) removeMax() *btreeFrontierEntry {
if n.leaf() {
n.count--
out := n.items[n.count]
n.items[n.count] = nilT
n.adjustUpperBoundOnRemoval(out, nil)
return out
}
// Recurse into max child.
i := int(n.count)
if n.children[i].count <= minItems {
// Child not large enough to remove from.
n.rebalanceOrMerge(i)
return n.removeMax() // redo
}
child := mut(&n.children[i])
out := child.removeMax()
n.adjustUpperBoundOnRemoval(out, nil)
return out
}
// remove removes an item from the subtree rooted at this node. Returns the item
// that was removed or nil if no matching item was found. Also returns whether
// the node's upper bound changes.
func (n *node) remove(item *btreeFrontierEntry) (out *btreeFrontierEntry, newBound bool) {
i, found := n.find(item)
if n.leaf() {
if found {
out, _ = n.removeAt(i)
return out, n.adjustUpperBoundOnRemoval(out, nil)
}
return nilT, false
}
if n.children[i].count <= minItems {
// Child not large enough to remove from.
n.rebalanceOrMerge(i)
return n.remove(item) // redo
}
child := mut(&n.children[i])
if found {
// Replace the item being removed with the max item in our left child.
out = n.items[i]
n.items[i] = child.removeMax()
return out, n.adjustUpperBoundOnRemoval(out, nil)
}
// Item is not in this node and child is large enough to remove from.
out, newBound = child.remove(item)
if newBound {
newBound = n.adjustUpperBoundOnRemoval(out, nil)
}
return out, newBound
}
// rebalanceOrMerge grows child 'i' to ensure it has sufficient room to remove
// an item from it while keeping it at or above minItems.
func (n *node) rebalanceOrMerge(i int) {
switch {
case i > 0 && n.children[i-1].count > minItems:
// Rebalance from left sibling.
//
// +-----------+
// | y |
// +----/-\----+
// / \
// v v
// +-----------+ +-----------+
// | x | | |
// +----------\+ +-----------+
// \
// v
// a
//
// After:
//
// +-----------+
// | x |
// +----/-\----+
// / \
// v v
// +-----------+ +-----------+
// | | | y |
// +-----------+ +/----------+
// /
// v
// a
//
left := mut(&n.children[i-1])
child := mut(&n.children[i])
xLa, grandChild := left.popBack()
yLa := n.items[i-1]
child.pushFront(yLa, grandChild)
n.items[i-1] = xLa
left.adjustUpperBoundOnRemoval(xLa, grandChild)
child.adjustUpperBoundOnInsertion(yLa, grandChild)
case i < int(n.count) && n.children[i+1].count > minItems:
// Rebalance from right sibling.
//
// +-----------+
// | y |
// +----/-\----+
// / \
// v v
// +-----------+ +-----------+
// | | | x |
// +-----------+ +/----------+
// /
// v
// a
//
// After:
//
// +-----------+
// | x |
// +----/-\----+
// / \
// v v
// +-----------+ +-----------+
// | y | | |
// +----------\+ +-----------+
// \
// v
// a
//
right := mut(&n.children[i+1])
child := mut(&n.children[i])
xLa, grandChild := right.popFront()
yLa := n.items[i]
child.pushBack(yLa, grandChild)
n.items[i] = xLa
right.adjustUpperBoundOnRemoval(xLa, grandChild)
child.adjustUpperBoundOnInsertion(yLa, grandChild)
default:
// Merge with either the left or right sibling.
//
// +-----------+
// | u y v |
// +----/-\----+
// / \
// v v
// +-----------+ +-----------+
// | x | | z |
// +-----------+ +-----------+
//
// After:
//
// +-----------+
// | u v |
// +-----|-----+
// |
// v
// +-----------+
// | x y z |
// +-----------+
//
if i >= int(n.count) {
i = int(n.count - 1)
}
child := mut(&n.children[i])
// Make mergeChild mutable, bumping the refcounts on its children if necessary.
_ = mut(&n.children[i+1])
mergeLa, mergeChild := n.removeAt(i)
child.items[child.count] = mergeLa
copy(child.items[child.count+1:], mergeChild.items[:mergeChild.count])
if !child.leaf() {
copy(child.children[child.count+1:], mergeChild.children[:mergeChild.count+1])
}
child.count += mergeChild.count + 1
child.adjustUpperBoundOnInsertion(mergeLa, mergeChild)
mergeChild.decRef(false /* recursive */)
}
}
// findUpperBound returns the largest end key bound in the subtree rooted at
// this node, assuming that its children already have correct upper bounds set.
func (n *node) findUpperBound() keyBound {
var max keyBound
for i := int16(0); i < n.count; i++ {
up := upperBound(n.items[i])
if max.compare(up) < 0 {
max = up
}
}
if !n.leaf() {
for i := int16(0); i <= n.count; i++ {
up := n.children[i].max()
if max.compare(up) < 0 {
max = up
}
}
}
return max
}
// adjustUpperBoundOnInsertion adjusts the upper key bound for this node given
// an item and an optional child node that was inserted. Returns true if the
// upper bound was changed and false if not.
func (n *node) adjustUpperBoundOnInsertion(item *btreeFrontierEntry, child *node) bool {
up := upperBound(item)
if child != nil {
if childMax := child.max(); up.compare(childMax) < 0 {
up = childMax
}
}
if n.max().compare(up) < 0 {
n.setMax(up)
return true
}
return false
}
// adjustUpperBoundOnRemoval adjusts the upper key bound for this node given an
// item and an optional child node that was removed. Returns true if the upper
// bound was changed and false if not.
func (n *node) adjustUpperBoundOnRemoval(item *btreeFrontierEntry, child *node) bool {
up := upperBound(item)
if child != nil {
if childMax := child.max(); up.compare(childMax) < 0 {
up = childMax
}
}
if n.max().compare(up) == 0 {
// up was previous upper bound of n.
max := n.findUpperBound()
n.setMax(max)
return max.compare(up) != 0
}
return false
}
// btree is an implementation of an augmented interval B-Tree.
//
// btree stores items in an ordered structure, allowing easy insertion,
// removal, and iteration. It represents intervals and permits an interval
// search operation following the approach laid out in CLRS, Chapter 14.
// The B-Tree stores items in order based on their start key and each
// B-Tree node maintains the upper-bound end key of all items in its
// subtree.
//
// Write operations are not safe for concurrent mutation by multiple
// goroutines, but Read operations are.
type btree struct {
root *node
length int
}
// Reset removes all items from the btree. In doing so, it allows memory
// held by the btree to be recycled. Failure to call this method before
// letting a btree be GCed is safe in that it won't cause a memory leak,
// but it will prevent btree nodes from being efficiently re-used.
func (t *btree) Reset() {
if t.root != nil {
t.root.decRef(true /* recursive */)
t.root = nil
}
t.length = 0
}
// Clone clones the btree, lazily. It does so in constant time.
func (t *btree) Clone() btree {
c := *t
if c.root != nil {
// Incrementing the reference count on the root node is sufficient to
// ensure that no node in the cloned tree can be mutated by an actor
// holding a reference to the original tree and vice versa. This
// property is upheld because the root node in the receiver btree and
// the returned btree will both necessarily have a reference count of at
// least 2 when this method returns. All tree mutations recursively
// acquire mutable node references (see mut) as they traverse down the
// tree. The act of acquiring a mutable node reference performs a clone
// if a node's reference count is greater than one. Cloning a node (see
// clone) increases the reference count on each of its children,
// ensuring that they have a reference count of at least 2. This, in
// turn, ensures that any of the child nodes that are modified will also
// be copied-on-write, recursively ensuring the immutability property
// over the entire tree.
c.root.incRef()
}
return c
}
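// Illustrative sketch (not part of the original source): Clone is constant
// time and both trees remain independently usable afterwards; later mutations
// copy shared nodes lazily via mut:
//
// snapshot := t.Clone() // cheap; shares every node with t
// t.Set(entry)          // copies only the nodes on the affected path
// _ = snapshot.Len()    // snapshot still reflects the pre-Set contents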
// Delete removes an item equal to the passed in item from the tree.
func (t *btree) Delete(item *btreeFrontierEntry) {
if t.root == nil || t.root.count == 0 {
return
}
if out, _ := mut(&t.root).remove(item); out != nilT {
t.length--
}
if t.root.count == 0 {
old := t.root
if t.root.leaf() {
t.root = nil
} else {
t.root = t.root.children[0]
}
old.decRef(false /* recursive */)
}
}
// Set adds the given item to the tree. If an item in the tree already equals
// the given one, it is replaced with the new item.
func (t *btree) Set(item *btreeFrontierEntry) {
if t.root == nil {
t.root = newLeafNode()
} else if t.root.count >= maxItems {
splitLa, splitNode := mut(&t.root).split(maxItems / 2)
newRoot := newNode()
newRoot.count = 1
newRoot.items[0] = splitLa
newRoot.children[0] = t.root
newRoot.children[1] = splitNode
newRoot.setMax(newRoot.findUpperBound())
t.root = newRoot
}
if replaced, _ := mut(&t.root).insert(item); !replaced {
t.length++
}
}
// MakeIter returns a new iterator object. It is not safe to continue using an
// iterator after modifications are made to the tree. If modifications are made,
// create a new iterator.
func (t *btree) MakeIter() iterator {
return iterator{r: t.root, pos: -1}
}
// Height returns the height of the tree.
func (t *btree) Height() int {
if t.root == nil {
return 0
}
h := 1
n := t.root
for !n.leaf() {
n = n.children[0]
h++
}
return h
}
// Len returns the number of items currently in the tree.
func (t *btree) Len() int {
return t.length
}
// String returns a string description of the tree. The format is
// similar to the https://en.wikipedia.org/wiki/Newick_format.
func (t *btree) String() string {
if t.length == 0 {
return ";"
}
var b strings.Builder
t.root.writeString(&b)
return b.String()
}
func (n *node) writeString(b *strings.Builder) {
if n.leaf() {
for i := int16(0); i < n.count; i++ {
if i != 0 {
b.WriteString(",")
}
b.WriteString(n.items[i].String())
}
return
}
for i := int16(0); i <= n.count; i++ {
b.WriteString("(")
n.children[i].writeString(b)
b.WriteString(")")
if i < n.count {
b.WriteString(n.items[i].String())
}
}
}
// iterStack represents a stack of (node, pos) tuples, which captures
// iteration state as an iterator descends a btree.
type iterStack struct {
a iterStackArr
aLen int16 // -1 when using s
s []iterFrame
}
// Used to avoid allocations for stacks below a certain size.
type iterStackArr [3]iterFrame
type iterFrame struct {
n *node
pos int16
}
func (is *iterStack) push(f iterFrame) {
if is.aLen == -1 {
is.s = append(is.s, f)
} else if int(is.aLen) == len(is.a) {
is.s = make([]iterFrame, int(is.aLen)+1, 2*int(is.aLen))
copy(is.s, is.a[:])
is.s[int(is.aLen)] = f
is.aLen = -1
} else {
is.a[is.aLen] = f
is.aLen++
}
}
func (is *iterStack) pop() iterFrame {
if is.aLen == -1 {
f := is.s[len(is.s)-1]
is.s = is.s[:len(is.s)-1]
return f
}
is.aLen--
return is.a[is.aLen]
}
func (is *iterStack) len() int {
if is.aLen == -1 {
return len(is.s)
}
return int(is.aLen)
}
func (is *iterStack) reset() {
if is.aLen == -1 {
is.s = is.s[:0]
} else {
is.aLen = 0
}
}
// iterator is responsible for search and traversal within a btree.
type iterator struct {
r *node
n *node
pos int16
s iterStack
o overlapScan
}
func (i *iterator) reset() {
i.n = i.r
i.pos = -1
i.s.reset()
i.o = overlapScan{}
}
func (i *iterator) descend(n *node, pos int16) {
i.s.push(iterFrame{n: n, pos: pos})
i.n = n.children[pos]
i.pos = 0
}
// ascend ascends up to the current node's parent and resets the position
// to the one previously set for this parent node.
func (i *iterator) ascend() {
f := i.s.pop()
i.n = f.n
i.pos = f.pos
}
// SeekGE seeks to the first item greater-than or equal to the provided
// item.
func (i *iterator) SeekGE(item *btreeFrontierEntry) {
i.reset()
if i.n == nil {
return
}
for {
pos, found := i.n.find(item)
i.pos = int16(pos)
if found {
return
}
if i.n.leaf() {
if i.pos == i.n.count {
i.Next()
}
return
}
i.descend(i.n, i.pos)
}
}
// SeekLT seeks to the first item less-than the provided item.
func (i *iterator) SeekLT(item *btreeFrontierEntry) {
i.reset()
if i.n == nil {
return
}
for {
pos, found := i.n.find(item)
i.pos = int16(pos)
if found || i.n.leaf() {
i.Prev()
return
}
i.descend(i.n, i.pos)
}
}
// First seeks to the first item in the btree.
func (i *iterator) First() {
i.reset()
if i.n == nil {
return
}
for !i.n.leaf() {
i.descend(i.n, 0)
}
i.pos = 0
}
// Last seeks to the last item in the btree.
func (i *iterator) Last() {
i.reset()
if i.n == nil {
return
}
for !i.n.leaf() {
i.descend(i.n, i.n.count)
}
i.pos = i.n.count - 1
}
// Next positions the iterator to the item immediately following
// its current position.
func (i *iterator) Next() {
if i.n == nil {
return
}
if i.n.leaf() {
i.pos++
if i.pos < i.n.count {
return
}
for i.s.len() > 0 && i.pos >= i.n.count {
i.ascend()
}
return
}
i.descend(i.n, i.pos+1)
for !i.n.leaf() {
i.descend(i.n, 0)
}
i.pos = 0
}
// Prev positions the iterator to the item immediately preceding
// its current position.
func (i *iterator) Prev() {
if i.n == nil {
return
}
if i.n.leaf() {
i.pos--
if i.pos >= 0 {
return
}
for i.s.len() > 0 && i.pos < 0 {
i.ascend()
i.pos--
}
return
}
i.descend(i.n, i.pos)
for !i.n.leaf() {
i.descend(i.n, i.n.count)
}
i.pos = i.n.count - 1
}
// Valid returns whether the iterator is positioned at a valid position.
func (i *iterator) Valid() bool {
return i.pos >= 0 && i.pos < i.n.count
}
// Cur returns the item at the iterator's current position. It is illegal
// to call Cur if the iterator is not valid.
func (i *iterator) Cur() *btreeFrontierEntry {
return i.n.items[i.pos]
}
// An overlap scan is a scan over all items that overlap with the provided
// item in order of the overlapping items' start keys. The goal of the scan
// is to minimize the number of key comparisons performed in total. The
// algorithm operates based on the following two invariants maintained by the
// augmented interval btree:
// 1. all items are sorted in the btree based on their start key.
// 2. all btree nodes maintain the upper bound end key of all items
// in their subtree.
//
// The scan algorithm starts in "unconstrained minimum" and "unconstrained
// maximum" states. To enter a "constrained minimum" state, the scan must reach
// items in the tree with start keys above the search range's start key.
// Because items in the tree are sorted by start key, once the scan enters the
// "constrained minimum" state it will remain there. To enter a "constrained
// maximum" state, the scan must determine the first child btree node in a given
// subtree that can have items with start keys above the search range's end
// key. The scan then remains in the "constrained maximum" state until it
// traverses into this child node, at which point it moves to the "unconstrained
// maximum" state again.
//
// The scan algorithm works like a standard btree forward scan with the
// following augmentations:
// 1. before traversing the tree, the scan performs a binary search on the
// root node's items to determine a "soft" lower-bound constraint position
// and a "hard" upper-bound constraint position in the root's children.
// 2. when traversing into a child node in the lower or upper bound constraint
// position, the constraint is refined by searching the child's items.
// 3. the initial traversal down the tree follows the left-most children
// whose upper bound end keys are equal to or greater than the start key
// of the search range. The children followed will be equal to or less
// than the soft lower bound constraint.
// 4. once the initial traversal completes and the scan is in the left-most
// btree node whose upper bound overlaps the search range, key comparisons
// must be performed with each item in the tree. This is necessary because
// any of these items may have end keys that cause them to overlap with the
// search range.
// 5. once the scan reaches the lower bound constraint position (the first item
// with a start key equal to or greater than the search range's start key),
// it can begin scanning without performing key comparisons. This is allowed
// because all items from this point forward will have end keys that are
// greater than the search range's start key.
// 6. once the scan reaches the upper bound constraint position, it terminates.
// It does so because the item at this position is the first item with a
// start key larger than the search range's end key.
type overlapScan struct {
// The "soft" lower-bound constraint.
constrMinN *node
constrMinPos int16
constrMinReached bool
// The "hard" upper-bound constraint.
constrMaxN *node
constrMaxPos int16
}
// FirstOverlap seeks to the first item in the btree that overlaps with the
// provided search item.
func (i *iterator) FirstOverlap(item *btreeFrontierEntry) {
i.reset()
if i.n == nil {
return
}
i.pos = 0
i.o = overlapScan{}
i.constrainMinSearchBounds(item)
i.constrainMaxSearchBounds(item)
i.findNextOverlap(item)
}
// NextOverlap positions the iterator to the item immediately following
// its current position that overlaps with the search item.
func (i *iterator) NextOverlap(item *btreeFrontierEntry) {
if i.n == nil {
return
}
i.pos++
i.findNextOverlap(item)
}
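// Illustrative sketch (not part of the original source): a complete overlap
// scan pairs FirstOverlap with NextOverlap until the iterator becomes invalid,
// which is how SpanEntries in this package consumes it:
//
// it := t.MakeIter()
// for it.FirstOverlap(search); it.Valid(); it.NextOverlap(search) {
//   entry := it.Cur() // entry's span overlaps the search span
//   _ = entry
// }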
func (i *iterator) constrainMinSearchBounds(item *btreeFrontierEntry) {
k := item.Key()
j := sort.Search(int(i.n.count), func(j int) bool {
return bytes.Compare(k, i.n.items[j].Key()) <= 0
})
i.o.constrMinN = i.n
i.o.constrMinPos = int16(j)
}
func (i *iterator) constrainMaxSearchBounds(item *btreeFrontierEntry) {
up := upperBound(item)
j := sort.Search(int(i.n.count), func(j int) bool {
return !up.contains(i.n.items[j])
})
i.o.constrMaxN = i.n
i.o.constrMaxPos = int16(j)
}
func (i *iterator) findNextOverlap(item *btreeFrontierEntry) {
for {
if i.pos > i.n.count {
// Iterate up tree.
i.ascend()
} else if !i.n.leaf() {
// Iterate down tree.
if i.o.constrMinReached || i.n.children[i.pos].max().contains(item) {
par := i.n
pos := i.pos
i.descend(par, pos)
// Refine the constraint bounds, if necessary.
if par == i.o.constrMinN && pos == i.o.constrMinPos {
i.constrainMinSearchBounds(item)
}
if par == i.o.constrMaxN && pos == i.o.constrMaxPos {
i.constrainMaxSearchBounds(item)
}
continue
}
}
// Check search bounds.
if i.n == i.o.constrMaxN && i.pos == i.o.constrMaxPos {
// Invalid. Past possible overlaps.
i.pos = i.n.count
return
}
if i.n == i.o.constrMinN && i.pos == i.o.constrMinPos {
// The scan reached the soft lower-bound constraint.
i.o.constrMinReached = true
}
// Iterate across node.
if i.pos < i.n.count {
// Check for overlapping item.
if i.o.constrMinReached {
// Fast-path to avoid span comparison. i.o.constrMinReached
// tells us that all items have end keys above our search
// span's start key.
return
}
if upperBound(i.n.items[i.pos]).contains(item) {
return
}
}
i.pos++
}
}
// Copyright 2023 The Cockroach Authors.
//
// Use of this software is governed by the CockroachDB Software License
// included in the /LICENSE file.
package span
import (
"container/heap"
"fmt"
"strings"
"sync"
"sync/atomic"
// Needed for roachpb.Span.String().
_ "github.com/cockroachdb/cockroach/pkg/keys"
"github.com/cockroachdb/cockroach/pkg/roachpb"
"github.com/cockroachdb/cockroach/pkg/util/buildutil"
"github.com/cockroachdb/cockroach/pkg/util/envutil"
"github.com/cockroachdb/cockroach/pkg/util/hlc"
"github.com/cockroachdb/cockroach/pkg/util/interval"
"github.com/cockroachdb/cockroach/pkg/util/metamorphic"
"github.com/cockroachdb/cockroach/pkg/util/syncutil"
"github.com/cockroachdb/errors"
)
// Frontier tracks the minimum timestamp of a set of spans.
// Frontier is not safe for concurrent modification, but MakeConcurrentFrontier
// can be used to construct a thread-safe Frontier.
type Frontier interface {
// AddSpansAt adds the provided spans to the frontier at the provided timestamp.
// If a span overlaps any spans already tracked by the frontier, the tree is
// adjusted to hold the union of the span and the overlaps, with all entries
// assigned the startAt timestamp.
AddSpansAt(startAt hlc.Timestamp, spans ...roachpb.Span) error
// Frontier returns the minimum timestamp being tracked.
Frontier() hlc.Timestamp
// PeekFrontierSpan returns one of the spans at the Frontier.
PeekFrontierSpan() roachpb.Span
// Forward advances the timestamp for a span. Any part of the span that doesn't
// overlap the tracked span set will be ignored. True is returned if the
// frontier advanced as a result.
Forward(span roachpb.Span, ts hlc.Timestamp) (bool, error)
// Release removes all items from the frontier. In doing so, it allows memory
// held by the frontier to be recycled. Failure to call this method before
// letting a frontier be GCed is safe in that it won't cause a memory leak,
// but it will prevent frontier nodes from being efficiently re-used.
Release()
// Entries invokes the given callback with the current timestamp for each
// component span in the tracked span set.
// The fn may not mutate this frontier while iterating.
Entries(fn Operation)
// SpanEntries invokes op for each sub-span of the specified span with the
// timestamp as observed by this frontier.
//
// Time
// 5| .b__c .
// 4| . h__k .
// 3| . e__f .
// 1 ---a----------------------m---q-- Frontier
//
// |___________span___________|
//
// In the above example, frontier tracks [b, m) and the current frontier
// timestamp is 1. SpanEntries for span [a-q) will invoke op with:
//
// ([b-c), 5), ([c-e), 1), ([e-f), 3), ([f-h), 1), ([h-k), 4), ([k-m), 1).
//
// Note: neither [a-b) nor [m-q) will be emitted since they do not intersect with the spans
// tracked by this frontier.
// The fn may not mutate this frontier while iterating.
SpanEntries(span roachpb.Span, op Operation)
// Len returns the number of spans tracked by the frontier.
Len() int
// String returns a string representation of this Frontier.
String() string
}
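// Illustrative sketch (not part of the original source) of basic Frontier
// usage; the keys and wall times below are made up:
//
// f, err := MakeFrontier(roachpb.Span{Key: roachpb.Key("a"), EndKey: roachpb.Key("z")})
// if err != nil { return err }
// defer f.Release()
// // Forward only part of the tracked span set.
// advanced, _ := f.Forward(roachpb.Span{Key: roachpb.Key("a"), EndKey: roachpb.Key("m")}, hlc.Timestamp{WallTime: 10})
// // advanced == false: [m, z) is still at the zero timestamp, so f.Frontier()
// // has not moved.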
// OpResult is the result of the Operation callback.
type OpResult bool
const (
// ContinueMatch signals DoMatching should continue.
ContinueMatch OpResult = false
// StopMatch signals DoMatching should stop.
StopMatch OpResult = true
)
func (r OpResult) asBool() bool {
return bool(r)
}
// An Operation is a function that operates on frontier spans. If done is returned true, the
// Operation is indicating that no further work needs to be done and so the DoMatching function
// should traverse no further.
type Operation func(roachpb.Span, hlc.Timestamp) (done OpResult)
var useBtreeFrontier = envutil.EnvOrDefaultBool("COCKROACH_BTREE_SPAN_FRONTIER_ENABLED",
metamorphic.ConstantWithTestBool("COCKROACH_BTREE_SPAN_FRONTIER_ENABLED", true))
// EnableBtreeFrontier sets useBtreeFrontier for testing purposes.
func EnableBtreeFrontier(enabled bool) func() {
old := useBtreeFrontier
useBtreeFrontier = enabled
return func() {
useBtreeFrontier = old
}
}
func newBtreeFrontier() Frontier {
return &btreeFrontier{}
}
func newLLRBFrontier() Frontier {
return &llrbFrontier{tree: interval.NewTree(interval.ExclusiveOverlapper)}
}
func newFrontier() Frontier {
if useBtreeFrontier {
return newBtreeFrontier()
}
return newLLRBFrontier()
}
// MakeFrontier returns a Frontier that tracks the given set of spans.
// Each span's timestamp is initialized to 0.
func MakeFrontier(spans ...roachpb.Span) (Frontier, error) {
return MakeFrontierAt(hlc.Timestamp{}, spans...)
}
// MakeFrontierAt returns a Frontier that tracks the given set of spans.
// Each span's timestamp is initialized to the specified start time.
func MakeFrontierAt(startAt hlc.Timestamp, spans ...roachpb.Span) (Frontier, error) {
f := newFrontier()
if err := f.AddSpansAt(startAt, spans...); err != nil {
f.Release() // release whatever was allocated.
return nil, err
}
return f, nil
}
// MakeConcurrentFrontier wraps provided frontier to make it safe to use concurrently.
func MakeConcurrentFrontier(f Frontier) Frontier {
return &concurrentFrontier{f: f}
}
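// Illustrative sketch (not part of the original source): wrapping serializes
// every Frontier method on the wrapper's mutex, so the result can be shared
// across goroutines:
//
// f, err := MakeFrontier(sp)
// if err != nil { return err }
// cf := MakeConcurrentFrontier(f)
// // cf.Forward, cf.Frontier, cf.Entries, etc. may now be called concurrently.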
// btreeFrontier is a btree based implementation of Frontier.
type btreeFrontier struct {
// tree contains `*btreeFrontierEntry` items for the entire currently tracked
// span set. Any tracked spans that have never been `Forward`ed will have a
// zero timestamp. If any entries needed to be split along a tracking
// boundary, this has already been done by `forward` before it entered the
// tree.
tree btree
// minHeap contains the same `*btreeFrontierEntry` items as `tree`. Entries
// in the heap are ordered first by minimum timestamp and, for equal
// timestamps, by smaller start key.
minHeap frontierHeap
idAlloc uint64
mergeAlloc []*btreeFrontierEntry // Amortize allocations.
// disallowMutationWhileIterating is set when iterating
// over frontier entries. Attempts to mutate this frontier
// will panic under test builds or return an error otherwise.
disallowMutationWhileIterating atomic.Bool
}
// btreeFrontierEntry represents a timestamped span. It is used as the nodes in both
// the tree and heap needed to keep the Frontier.
// btreeFrontierEntry implements interval/generic interface.
type btreeFrontierEntry struct {
Start, End roachpb.Key
ts hlc.Timestamp
// id is a unique ID assigned to each frontier entry
// (required by the underlying generic btree implementation).
id uint64
// The heapIdx of the item in the frontierHeap, maintained by the
// heap.Interface methods.
heapIdx int
// spanCopy contains a copy of the user provided span.
// This is used only under test to detect frontier misuse when
// the caller mutates span keys after adding those spans to this frontier.
spanCopy roachpb.Span
}
//go:generate ../interval/generic/gen.sh *btreeFrontierEntry span
// AddSpansAt adds the provided spans to the btreeFrontier at the provided timestamp.
// AddSpansAt deletes any overlapping spans already in the frontier.
//
// NB: It is *extremely* important for the caller to guarantee that the passed
// in spans (the underlying Key/EndKey []byte slices) are not modified in any
// way after this call. If modifications are made to the underlying key slices
// after the spans are added, the results are undefined -- anything from panic
// to infinite loops are possible. While this warning is scary, as it should be,
// the reality is that all callers so far use spans that come from an external
// source (an iterator, or RPC), and none of these callers ever modify the
// underlying keys. If the caller has to modify the underlying key slices, they
// must pass in a copy.
func (f *btreeFrontier) AddSpansAt(startAt hlc.Timestamp, spans ...roachpb.Span) (retErr error) {
if err := f.checkDisallowedMutation(); err != nil {
return err
}
if expensiveChecksEnabled() {
defer func() {
if err := f.checkUnsafeKeyModification(); err != nil {
retErr = errors.CombineErrors(retErr, err)
}
}()
}
for _, toAdd := range spans {
// Validate caller provided span.
if err := checkSpan(toAdd); err != nil {
return err
}
// Add toAdd sub-spans that do not overlap this frontier. To ensure that
// adjacent spans are merged, sub-spans are added in two steps: first,
// non-overlapping spans are added with 0 timestamp; then the timestamp for
// the entire toAdd span is forwarded.
for _, s := range spanDifference(toAdd, f) {
e := newFrontierEntry(&f.idAlloc, s.Key, s.EndKey, hlc.Timestamp{})
if err := f.setEntry(e); err != nil {
putFrontierEntry(e)
return err
}
}
if err := f.forward(toAdd, startAt); err != nil {
return err
}
}
return nil
}
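// Illustrative example (not from the original source) of the two-step add:
// with the frontier tracking [c, e)@5, AddSpansAt(3, [a, g)) first inserts the
// non-overlapping pieces [a, c) and [e, g) at timestamp 0 (via spanDifference),
// then forwards all of [a, g) to 3; the already-tracked [c, e)@5 is not
// regressed, leaving [a, c)@3 [c, e)@5 [e, g)@3.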
// Release removes all items from the btreeFrontier. In doing so, it allows memory
// held by the btreeFrontier to be recycled. Failure to call this method before
// letting a btreeFrontier be GCed is safe in that it won't cause a memory leak,
// but it will prevent btreeFrontier nodes from being efficiently re-used.
func (f *btreeFrontier) Release() {
it := f.tree.MakeIter()
for it.First(); it.Valid(); it.Next() {
putFrontierEntry(it.Cur())
}
f.tree.Reset()
}
// Frontier returns the minimum timestamp being tracked.
func (f *btreeFrontier) Frontier() hlc.Timestamp {
if f.minHeap.Len() == 0 {
return hlc.Timestamp{}
}
return f.minHeap[0].ts
}
// PeekFrontierSpan returns one of the spans at the Frontier.
func (f *btreeFrontier) PeekFrontierSpan() roachpb.Span {
if f.minHeap.Len() == 0 {
return roachpb.Span{}
}
return f.minHeap[0].span()
}
// Forward advances the timestamp for a span. Any part of the span that doesn't
// overlap the tracked span set will be ignored. True is returned if the
// frontier advanced as a result.
//
// Note that internally, it may be necessary to use multiple entries to
// represent this timestamped span (e.g. if it overlaps with the tracked span
// set boundary). Similarly, an entry created by a previous Forward may be
// partially overlapped and have to be split into two entries.
//
// NB: it is unsafe for the caller to modify the keys in the provided span after this
// call returns.
func (f *btreeFrontier) Forward(
span roachpb.Span, ts hlc.Timestamp,
) (forwarded bool, retErr error) {
if err := f.checkDisallowedMutation(); err != nil {
return false, err
}
// Validate caller provided span.
if err := checkSpan(span); err != nil {
return false, err
}
if expensiveChecksEnabled() {
defer func() {
if err := f.checkUnsafeKeyModification(); err != nil {
retErr = errors.CombineErrors(retErr, err)
}
}()
}
prevFrontier := f.Frontier()
if err := f.forward(span, ts); err != nil {
return false, err
}
return prevFrontier.Less(f.Frontier()), nil
}
// clone augments generated iterStack code to support cloning.
func (is *iterStack) clone() iterStack {
c := *is
c.s = append([]iterFrame(nil), is.s...) // copy stack.
return c
}
// clone augments generated iterator code to support cloning.
func (i *iterator) clone() iterator {
c := *i
c.s = i.s.clone() // copy stack.
return c
}
// mergeEntries searches for the entries to the left and to the right
// of the input entry that are contiguous to the entry range and have the same timestamp.
// It updates the btree to hold a single merged entry.
// Any existing tree iterators and the passed in entry should be considered invalid after this call.
// Returns the btreeFrontierEntry that replaced the passed in entry.
func (f *btreeFrontier) mergeEntries(e *btreeFrontierEntry) (*btreeFrontierEntry, error) {
defer func() {
f.mergeAlloc = f.mergeAlloc[:0]
}()
// First, position iterator at e.
pos := f.tree.MakeIter()
pos.SeekGE(e)
if !pos.Valid() || pos.Cur() != e {
return nil, errors.AssertionFailedf("failed to find entry %s in btree", e)
}
// Now, search for contiguous spans to the left of e.
leftMost := e
leftIter := pos.clone()
for leftIter.Prev(); leftIter.Valid(); leftIter.Prev() {
if !(leftIter.Cur().End.Equal(leftMost.Start) && leftIter.Cur().ts.Equal(e.ts)) {
break
}
f.mergeAlloc = append(f.mergeAlloc, leftIter.Cur())
leftMost = leftIter.Cur()
}
if leftMost != e {
// We found ranges to the left of e that have the same timestamp.
// That means that we'll merge entries into leftMost, and we will
// also subsume e itself. This assignment ensures that leftMost
// entry is either an entry to the left of 'e' or the 'e' itself
// and that leftMost is removed from the mergeAlloc so that it will
// not be deleted below.
f.mergeAlloc[len(f.mergeAlloc)-1] = e
}
// Now, continue to the right of e.
end := e.End
rightIter := pos.clone()
for rightIter.Next(); rightIter.Valid(); rightIter.Next() {
if !(rightIter.Cur().Start.Equal(end) && rightIter.Cur().ts.Equal(e.ts)) {
break
}
end = rightIter.Cur().End
f.mergeAlloc = append(f.mergeAlloc, rightIter.Cur())
}
// If there were no left or right merges, return without restructuring the
// tree.
if len(f.mergeAlloc) == 0 {
return leftMost, nil
}
// Delete entries first, before updating leftMost boundaries since doing so
// will mess up btree.
for i, toRemove := range f.mergeAlloc {
f.mergeAlloc[i] = nil
if err := f.deleteEntry(toRemove); err != nil {
return nil, err
}
}
f.setEndKey(leftMost, end)
return leftMost, nil
}
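// Illustrative example (not from the original source): if the tree holds
// [a, b)@3 [b, c)@3 [c, d)@3 and mergeEntries is called on the middle entry,
// both neighbors are deleted and the left-most contiguous entry is widened,
// leaving the single entry [a, d)@3.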
// setEntry adds entry to the tree and to the heap.
func (f *btreeFrontier) setEntry(e *btreeFrontierEntry) error {
if expensiveChecksEnabled() {
if err := checkSpan(e.span()); err != nil {
return err
}
}
f.tree.Set(e)
heap.Push(&f.minHeap, e)
return nil
}
// deleteEntry removes entry from the tree and the heap, and releases this entry
// into the pool.
func (f *btreeFrontier) deleteEntry(e *btreeFrontierEntry) error {
defer putFrontierEntry(e)
if expensiveChecksEnabled() {
if err := checkSpan(e.span()); err != nil {
return err
}
}
heap.Remove(&f.minHeap, e.heapIdx)
f.tree.Delete(e)
return nil
}
// splitEntryAt splits entry at specified split point.
// Returns left and right entries.
// Any existing tree iterators are invalid after this call.
func (f *btreeFrontier) splitEntryAt(
e *btreeFrontierEntry, split roachpb.Key,
) (left, right *btreeFrontierEntry, err error) {
if expensiveChecksEnabled() {
if !e.span().ContainsKey(split) {
return nil, nil, errors.AssertionFailedf(
"split key %s is not contained by %s", split, e.span())
}
}
right = newFrontierEntry(&f.idAlloc, split, e.End, e.ts)
// Adjust e boundary before we add right (so that there is no overlap in the
// tree).
f.setEndKey(e, split)
if err := f.setEntry(right); err != nil {
putFrontierEntry(right)
return nil, nil, err
}
return e, right, nil
}
// setEndKey changes the end key assigned to the entry. setEndKey requires the
// entry to be in the tree.
func (f *btreeFrontier) setEndKey(e *btreeFrontierEntry, endKey roachpb.Key) {
// The tree implementation expects the Start and End keys of a span to be
// immutable. We therefore remove the entry before updating its End key and
// re-insert it, to avoid corrupting the `maxKey` that is inlined in the `node`.
f.tree.Delete(e)
e.End = endKey
if expensiveChecksEnabled() {
e.spanCopy.EndKey = append(roachpb.Key{}, endKey...)
}
f.tree.Set(e)
}
// forward is the workhorse of the btreeFrontier. It forwards the timestamp
// for the specified span, splitting and merging btreeFrontierEntries as needed.
func (f *btreeFrontier) forward(span roachpb.Span, insertTS hlc.Timestamp) error {
todoEntry := newSearchKey(span.Key, span.EndKey)
defer putFrontierEntry(todoEntry)
// forwardEntryTimestamp forwards timestamp to insertTS, and updates
// tree to merge contiguous spans with the same timestamp (if possible).
// NB: passed in entry and any existing iterators should be considered invalid
// after this call.
forwardEntryTimestamp := func(e *btreeFrontierEntry) (*btreeFrontierEntry, error) {
e.ts = insertTS
heap.Fix(&f.minHeap, e.heapIdx)
return f.mergeEntries(e)
}
for !todoEntry.isEmptyRange() { // Keep going as long as there is work to be done.
if expensiveChecksEnabled() {
if err := checkSpan(todoEntry.span()); err != nil {
return err
}
}
// Seek to the first entry overlapping todoEntry.
it := f.tree.MakeIter()
it.FirstOverlap(todoEntry)
if !it.Valid() {
break
}
overlap := it.Cur()
// Invariant (a): todoEntry.Start must be at or after overlap.Start.
// Trim todoEntry if it falls outside the span(s) tracked by this btreeFrontier.
// This establishes the invariant that overlap start must be at or before todoEntry start.
if todoEntry.Start.Compare(overlap.Start) < 0 {
todoEntry.Start = overlap.Start
if todoEntry.isEmptyRange() {
break
}
}
// Fast case: we already recorded higher timestamp for this overlap.
if insertTS.LessEq(overlap.ts) {
todoEntry.Start = overlap.End
continue
}
// Fast case: we expect that most of the time, we forward timestamp for
// stable ranges -- that is, we expect range split/merge are not that common.
// As such, if the overlap range exactly matches todoEntry, we can simply
// update overlap timestamp and be done.
if overlap.span().Equal(todoEntry.span()) {
if _, err := forwardEntryTimestamp(overlap); err != nil {
return err
}
break
}
// At this point, we know that overlap timestamp is not ahead of the
// insertTS (otherwise we'd hit fast case above).
// We need to split overlap range into multiple parts.
// 1. Possibly empty part before todoEntry.Start
// 2. Middle part (with updated timestamp),
// 3. Possibly empty part after todoEntry end.
if overlap.Start.Compare(todoEntry.Start) < 0 {
// Split overlap into 2 entries
// [overlap.Start, todoEntry.Start) and [todoEntry.Start, overlap.End)
// Invariant (b): after this step, overlap is split into 2 parts. The right
// part starts at todoEntry.Start.
_, _, err := f.splitEntryAt(overlap, todoEntry.Start)
if err != nil {
return err
}
continue
}
// NB: overlap.Start must be equal to todoEntry.Start (established by Invariant (a) and (b) above).
if expensiveChecksEnabled() && !overlap.Start.Equal(todoEntry.Start) {
return errors.AssertionFailedf("expected overlap %s to start at %s", overlap, todoEntry)
}
switch cmp := todoEntry.End.Compare(overlap.End); {
case cmp < 0:
// Our todoEntry ends before the overlap ends.
// Split overlap into 2 entries:
// [overlap.Start, todoEntry.End) and [todoEntry.End, overlap.End)
// Left entry can reuse overlap with insertTS.
left, right, err := f.splitEntryAt(overlap, todoEntry.End)
if err != nil {
return err
}
todoEntry.Start = right.End
// The left part advances its timestamp.
if _, err := forwardEntryTimestamp(left); err != nil {
return err
}
case cmp >= 0:
// todoEntry ends at or beyond overlap. Regardless, we can simply update overlap
// and if needed, continue matching remaining todoEntry (if any).
fwd, err := forwardEntryTimestamp(overlap)
if err != nil {
return err
}
todoEntry.Start = fwd.End
}
}
return nil
}
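// Illustrative example (not from the original source): forwarding [b, c)@4
// into a frontier holding the single entry [a, d)@2 first splits off [a, b)@2,
// then splits [b, d)@2 at c, and finally advances the middle piece, leaving
// [a, b)@2 [b, c)@4 [c, d)@2.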
func (f *btreeFrontier) disallowMutations() func() {
f.disallowMutationWhileIterating.Store(true)
return func() {
f.disallowMutationWhileIterating.Store(false)
}
}
// Entries invokes the given callback with the current timestamp for each
// component span in the tracked span set.
func (f *btreeFrontier) Entries(fn Operation) {
defer f.disallowMutations()()
it := f.tree.MakeIter()
for it.First(); it.Valid(); it.Next() {
if fn(it.Cur().span(), it.Cur().ts) == StopMatch {
break
}
}
}
// SpanEntries invokes op for each sub-span of the specified span with the
// timestamp as observed by this frontier.
//
// Time
// 5| .b__c .
// 4| . h__k .
// 3| . e__f .
// 1 ---a----------------------m---q-- Frontier
//
// |___________span___________|
//
// In the above example, frontier tracks [b, m) and the current frontier
// timestamp is 1. SpanEntries for span [a-q) will invoke op with:
//
// ([b-c), 5), ([c-e), 1), ([e-f), 3), ([f-h), 1), ([h-k), 4), ([k-m), 1).
//
// Note: neither [a-b) nor [m-q) will be emitted since they do not intersect with the spans
// tracked by this frontier.
func (f *btreeFrontier) SpanEntries(span roachpb.Span, op Operation) {
defer f.disallowMutations()()
todoRange := newSearchKey(span.Key, span.EndKey)
defer putFrontierEntry(todoRange)
it := f.tree.MakeIter()
for it.FirstOverlap(todoRange); it.Valid(); it.NextOverlap(todoRange) {
e := it.Cur()
// Skip untracked portion.
if todoRange.Start.Compare(e.Start) < 0 {
todoRange.Start = e.Start
}
end := e.End
if e.End.Compare(todoRange.End) > 0 {
end = todoRange.End
}
if op(roachpb.Span{Key: todoRange.Start, EndKey: end}, e.ts) == StopMatch {
return
}
todoRange.Start = end
}
}
// String implements Stringer.
func (f *btreeFrontier) String() string {
defer f.disallowMutations()()
var buf strings.Builder
it := f.tree.MakeIter()
for it.First(); it.Valid(); it.Next() {
if buf.Len() != 0 {
buf.WriteString(` `)
}
buf.WriteString(it.Cur().String())
}
return buf.String()
}
// Len implements Frontier.
func (f *btreeFrontier) Len() int {
return f.tree.Len()
}
func (e *btreeFrontierEntry) ID() uint64 {
return e.id
}
func (e *btreeFrontierEntry) Key() []byte {
return e.Start
}
func (e *btreeFrontierEntry) EndKey() []byte {
return e.End
}
func (e *btreeFrontierEntry) New() *btreeFrontierEntry {
return &btreeFrontierEntry{}
}
func (e *btreeFrontierEntry) SetID(id uint64) {
e.id = id
}
func (e *btreeFrontierEntry) SetKey(k []byte) {
e.Start = k
}
func (e *btreeFrontierEntry) SetEndKey(k []byte) {
e.End = k
}
func (e *btreeFrontierEntry) String() string {
return fmt.Sprintf("[%s@%s]", e.span(), e.ts)
}
func (e *btreeFrontierEntry) span() roachpb.Span {
return roachpb.Span{Key: e.Start, EndKey: e.End}
}
// isEmptyRange returns true if the btreeFrontierEntry range is empty.
return e.Start.Compare(e.End) >= 0
}
// frontierHeap implements heap.Interface and holds `btreeFrontierEntry`s. Entries
// are sorted based on their timestamp such that the oldest will rise to the top
// of the heap.
type frontierHeap []*btreeFrontierEntry
// Len implements heap.Interface.
func (h frontierHeap) Len() int { return len(h) }
// Less implements heap.Interface.
func (h frontierHeap) Less(i, j int) bool {
if h[i].ts == h[j].ts {
return h[i].Start.Compare(h[j].Start) < 0
}
return h[i].ts.Less(h[j].ts)
}
// Swap implements heap.Interface.
func (h frontierHeap) Swap(i, j int) {
h[i], h[j] = h[j], h[i]
h[i].heapIdx, h[j].heapIdx = i, j
}
// Push implements heap.Interface.
func (h *frontierHeap) Push(x interface{}) {
n := len(*h)
entry := x.(*btreeFrontierEntry)
entry.heapIdx = n
*h = append(*h, entry)
}
// Pop implements heap.Interface.
func (h *frontierHeap) Pop() interface{} {
old := *h
n := len(old)
entry := old[n-1]
entry.heapIdx = -1 // for safety
old[n-1] = nil // for gc
*h = old[0 : n-1]
return entry
}
// newFrontierEntry/putFrontierEntry provide access to pooled *btreeFrontierEntry.
var newFrontierEntry, putFrontierEntry = func() (
func(id *uint64, start, end roachpb.Key, ts hlc.Timestamp) *btreeFrontierEntry,
func(e *btreeFrontierEntry),
) {
entryPool := sync.Pool{New: func() any { return new(btreeFrontierEntry) }}
newEntry := func(idAlloc *uint64, start, end roachpb.Key, ts hlc.Timestamp) *btreeFrontierEntry {
e := entryPool.Get().(*btreeFrontierEntry)
var id uint64
if idAlloc != nil {
id = *idAlloc
*idAlloc++
}
*e = btreeFrontierEntry{
Start: start,
End: end,
id: id,
ts: ts,
heapIdx: -1,
}
if expensiveChecksEnabled() {
e.spanCopy.Key = append(e.spanCopy.Key, start...)
e.spanCopy.EndKey = append(e.spanCopy.EndKey, end...)
}
return e
}
putEntry := func(e *btreeFrontierEntry) {
e.Start = nil
e.End = nil
e.spanCopy.Key = nil
e.spanCopy.EndKey = nil
entryPool.Put(e)
}
return newEntry, putEntry
}()
// newSearchKey returns a btreeFrontierEntry that can be used to search/seek
// in the btree.
var newSearchKey = func(start, end roachpb.Key) *btreeFrontierEntry {
return newFrontierEntry(nil, start, end, hlc.Timestamp{})
}
// checkSpan validates span.
func checkSpan(s roachpb.Span) error {
switch s.Key.Compare(s.EndKey) {
case 1:
return errors.Wrapf(interval.ErrInvertedRange, "inverted span %s", s)
case 0:
if len(s.Key) == 0 && len(s.EndKey) == 0 {
return errors.Wrapf(interval.ErrNilRange, "nil span %s", s)
}
return errors.Wrapf(interval.ErrEmptyRange, "empty span %s", s)
default:
return nil
}
}
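// Illustrative examples (not from the original source), with non-empty keys a < b:
//
// checkSpan(roachpb.Span{Key: a, EndKey: b}) // nil
// checkSpan(roachpb.Span{Key: b, EndKey: a}) // wraps interval.ErrInvertedRange
// checkSpan(roachpb.Span{Key: a, EndKey: a}) // wraps interval.ErrEmptyRange
// checkSpan(roachpb.Span{})                  // wraps interval.ErrNilRange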
// checkUnsafeKeyModification is an expensive check performed under tests
// to verify that the caller did not mutate span keys after adding/forwarding them.
func (f *btreeFrontier) checkUnsafeKeyModification() error {
it := f.tree.MakeIter()
for it.First(); it.Valid(); it.Next() {
cur := it.Cur()
if !cur.Start.Equal(cur.spanCopy.Key) || !cur.End.Equal(cur.spanCopy.EndKey) {
return errors.Newf("unsafe span key modification: was %s, now %s", cur.spanCopy, cur.span())
}
}
return nil
}
func (f *btreeFrontier) checkDisallowedMutation() error {
if f.disallowMutationWhileIterating.Load() {
err := errors.AssertionFailedWithDepthf(1, "attempt to mutate frontier while iterating")
if buildutil.CrdbTestBuild {
panic(err)
}
return err
}
return nil
}
var disableSanityChecksForBenchmark bool
func expensiveChecksEnabled() bool {
return buildutil.CrdbTestBuild && !disableSanityChecksForBenchmark
}
// spanDifference subtracts the frontier's tracked spans from the given span
// and returns the set difference.
func spanDifference(s roachpb.Span, f Frontier) []roachpb.Span {
var sg roachpb.SpanGroup
sg.Add(s)
f.SpanEntries(s, func(overlap roachpb.Span, ts hlc.Timestamp) (done OpResult) {
sg.Sub(overlap)
return false
})
return sg.Slice()
}
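// Illustrative example (not from the original source): with a frontier
// tracking [b, d) and [f, h), spanDifference([a, j), f) returns
// [a, b), [d, f), [h, j) -- the portions of [a, j) the frontier does not track.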
type concurrentFrontier struct {
syncutil.Mutex
f Frontier
}
var _ Frontier = (*concurrentFrontier)(nil)
// AddSpansAt implements Frontier.
func (f *concurrentFrontier) AddSpansAt(startAt hlc.Timestamp, spans ...roachpb.Span) error {
f.Lock()
defer f.Unlock()
return f.f.AddSpansAt(startAt, spans...)
}
// Frontier implements Frontier.
func (f *concurrentFrontier) Frontier() hlc.Timestamp {
f.Lock()
defer f.Unlock()
return f.f.Frontier()
}
// PeekFrontierSpan implements Frontier.
func (f *concurrentFrontier) PeekFrontierSpan() roachpb.Span {
f.Lock()
defer f.Unlock()
return f.f.PeekFrontierSpan()
}
// Forward implements Frontier.
func (f *concurrentFrontier) Forward(span roachpb.Span, ts hlc.Timestamp) (bool, error) {
f.Lock()
defer f.Unlock()
return f.f.Forward(span, ts)
}
// Release implements Frontier.
func (f *concurrentFrontier) Release() {
f.Lock()
defer f.Unlock()
f.f.Release()
}
// Entries implements Frontier.
func (f *concurrentFrontier) Entries(fn Operation) {
f.Lock()
defer f.Unlock()
f.f.Entries(fn)
}
// SpanEntries implements Frontier.
func (f *concurrentFrontier) SpanEntries(span roachpb.Span, op Operation) {
f.Lock()
defer f.Unlock()
f.f.SpanEntries(span, op)
}
// Len implements Frontier.
func (f *concurrentFrontier) Len() int {
f.Lock()
defer f.Unlock()
return f.f.Len()
}
// String implements Frontier.
func (f *concurrentFrontier) String() string {
f.Lock()
defer f.Unlock()
return f.f.String()
}
// Copyright 2018 The Cockroach Authors.
//
// Use of this software is governed by the CockroachDB Software License
// included in the /LICENSE file.
package span
import (
"container/heap"
"context"
"fmt"
"math/rand"
"strings"
"testing"
"github.com/cockroachdb/cockroach/pkg/roachpb"
"github.com/cockroachdb/cockroach/pkg/testutils"
"github.com/cockroachdb/cockroach/pkg/util/encoding"
"github.com/cockroachdb/cockroach/pkg/util/hlc"
"github.com/cockroachdb/cockroach/pkg/util/leaktest"
"github.com/cockroachdb/cockroach/pkg/util/log"
"github.com/cockroachdb/cockroach/pkg/util/randutil"
"github.com/stretchr/testify/require"
)
func entriesStr(f Frontier) string {
var buf strings.Builder
f.Entries(func(sp roachpb.Span, ts hlc.Timestamp) OpResult {
if buf.Len() != 0 {
buf.WriteString(` `)
}
fmt.Fprintf(&buf, `%s@%d`, sp, ts.WallTime)
return ContinueMatch
})
return buf.String()
}
type frontierForwarder struct {
t *testing.T
f Frontier
advanced bool
}
func (f frontierForwarder) expectAdvanced(expected bool) frontierForwarder {
require.Equal(f.t, expected, f.advanced, entriesStr(f.f))
return f
}
func (f frontierForwarder) expectFrontier(wall int64) frontierForwarder {
require.Equal(f.t, hlc.Timestamp{WallTime: wall}, f.f.Frontier(),
"expected %d, found %s", wall, f.f.Frontier())
return f
}
func (f frontierForwarder) expectEntries(expected string) frontierForwarder {
require.Equal(f.t, expected, entriesStr(f.f))
return f
}
func makeFrontierForwarded(
t *testing.T, f Frontier,
) func(s roachpb.Span, wall int64) frontierForwarder {
t.Helper()
return func(s roachpb.Span, wall int64) frontierForwarder {
advanced, err := f.Forward(s, hlc.Timestamp{WallTime: wall})
require.NoError(t, err)
return frontierForwarder{
t: t,
f: f,
advanced: advanced,
}
}
}
func TestSpanFrontier(t *testing.T) {
defer leaktest.AfterTest(t)()
testutils.RunTrueAndFalse(t, "btree", func(t *testing.T, useBtreeFrontier bool) {
defer EnableBtreeFrontier(useBtreeFrontier)()
keyA, keyB := roachpb.Key("a"), roachpb.Key("b")
keyC, keyD := roachpb.Key("c"), roachpb.Key("d")
spAB := roachpb.Span{Key: keyA, EndKey: keyB}
spAC := roachpb.Span{Key: keyA, EndKey: keyC}
spAD := roachpb.Span{Key: keyA, EndKey: keyD}
spBC := roachpb.Span{Key: keyB, EndKey: keyC}
spBD := roachpb.Span{Key: keyB, EndKey: keyD}
spCD := roachpb.Span{Key: keyC, EndKey: keyD}
f, err := MakeFrontier(spAD)
require.NoError(t, err)
require.Equal(t, hlc.Timestamp{}, f.Frontier())
require.Equal(t, `{a-d}@0`, entriesStr(f))
forwardFrontier := makeFrontierForwarded(t, f)
// Untracked spans are ignored
forwardFrontier(roachpb.Span{Key: []byte("d"), EndKey: []byte("e")}, 1).
expectAdvanced(false).
expectFrontier(0).
expectEntries(`{a-d}@0`)
// Forward the entire tracked spanspace.
forwardFrontier(spAD, 1).
expectAdvanced(true).
expectFrontier(1).
expectEntries(`{a-d}@1`)
// Forward it again.
forwardFrontier(spAD, 2).
expectAdvanced(true).
expectFrontier(2).
expectEntries(`{a-d}@2`)
// Forward to the previous frontier.
forwardFrontier(spAD, 2).
expectAdvanced(false).
expectFrontier(2).
expectEntries(`{a-d}@2`)
// Forward into the past is ignored.
forwardFrontier(spAD, 1).
expectAdvanced(false).
expectFrontier(2).
expectEntries(`{a-d}@2`)
// Forward a subset.
forwardFrontier(spBC, 3).
expectAdvanced(false).
expectFrontier(2).
expectEntries(`{a-b}@2 {b-c}@3 {c-d}@2`)
// Forward it more.
forwardFrontier(spBC, 4).
expectAdvanced(false).
expectFrontier(2).
expectEntries(`{a-b}@2 {b-c}@4 {c-d}@2`)
// Forward all tracked spans to timestamp before BC (currently at 4).
// Advances to the min of tracked spans. Note that this requires the
// forwarded span to be split into two spans, one on each side of BC.
forwardFrontier(spAD, 3).
expectAdvanced(true).
expectFrontier(3).
expectEntries(`{a-b}@3 {b-c}@4 {c-d}@3`)
// Forward everything but BC, advances to the min of tracked spans.
forwardFrontier(spAB, 5).
expectAdvanced(false).
expectFrontier(3)
forwardFrontier(spCD, 5).
expectAdvanced(true).
expectFrontier(4).
expectEntries(`{a-b}@5 {b-c}@4 {c-d}@5`)
// Catch BC up: spans collapse.
forwardFrontier(spBC, 5).
expectAdvanced(true).
expectFrontier(5).
expectEntries(`{a-d}@5`)
// Forward them all at once.
forwardFrontier(spAD, 6).
expectAdvanced(true).
expectFrontier(6).
expectEntries(`{a-d}@6`)
// Split AC with BD.
forwardFrontier(spCD, 7).
expectAdvanced(false).
expectFrontier(6).
expectEntries(`{a-c}@6 {c-d}@7`)
forwardFrontier(spBD, 8).
expectAdvanced(false).
expectFrontier(6).
expectEntries(`{a-b}@6 {b-d}@8`)
forwardFrontier(spAB, 8).
expectAdvanced(true).
expectFrontier(8).
expectEntries(`{a-d}@8`)
// Split BD with AC.
forwardFrontier(spAC, 9).
expectAdvanced(false).
expectFrontier(8).
expectEntries(`{a-c}@9 {c-d}@8`)
forwardFrontier(spCD, 9).
expectAdvanced(true).
expectFrontier(9).
expectEntries(`{a-d}@9`)
})
}
func TestSpanFrontierDisjointSpans(t *testing.T) {
defer leaktest.AfterTest(t)()
testutils.RunTrueAndFalse(t, "btree", func(t *testing.T, useBtreeFrontier bool) {
defer EnableBtreeFrontier(useBtreeFrontier)()
keyA, keyB, keyC := roachpb.Key("a"), roachpb.Key("b"), roachpb.Key("c")
keyD, keyE, keyF := roachpb.Key("d"), roachpb.Key("e"), roachpb.Key("f")
spAB := roachpb.Span{Key: keyA, EndKey: keyB}
spAD := roachpb.Span{Key: keyA, EndKey: keyD}
spCE := roachpb.Span{Key: keyC, EndKey: keyE}
spDF := roachpb.Span{Key: keyD, EndKey: keyF}
f, err := MakeFrontier(spAB, spCE)
require.NoError(t, err)
require.Equal(t, hlc.Timestamp{}, f.Frontier())
require.Equal(t, `{a-b}@0 {c-e}@0`, entriesStr(f))
forwardFrontier := makeFrontierForwarded(t, f)
// Advance just the tracked spans
forwardFrontier(spCE, 1).
expectAdvanced(false).
expectFrontier(0).
expectEntries(`{a-b}@0 {c-e}@1`)
forwardFrontier(spAB, 1).
expectAdvanced(true).
expectFrontier(1).
expectEntries(`{a-b}@1 {c-e}@1`)
// Advance a span that partially overlaps the tracked spans
forwardFrontier(spDF, 2).
expectAdvanced(false).
expectFrontier(1).
expectEntries(`{a-b}@1 {c-d}@1 {d-e}@2`)
// Advance one span that covers two tracked spans and so needs two entries.
forwardFrontier(spAD, 3).
expectAdvanced(true).
expectFrontier(2).
expectEntries(`{a-b}@3 {c-d}@3 {d-e}@2`)
// Advance span that overlaps all the spans tracked by this frontier.
// {c-d} and {d-e} should collapse.
forwardFrontier(roachpb.Span{Key: roachpb.Key(`0`), EndKey: roachpb.Key(`q`)}, 4).
expectAdvanced(true).
expectFrontier(4).
expectEntries(`{a-b}@4 {c-e}@4`)
})
}
func TestSpanFrontierHeap(t *testing.T) {
defer leaktest.AfterTest(t)()
keyA, keyB, keyC := roachpb.Key("a"), roachpb.Key("b"), roachpb.Key("c")
spAB := roachpb.Span{Key: keyA, EndKey: keyB}
spBC := roachpb.Span{Key: keyB, EndKey: keyC}
var fh frontierHeap
mkFrontierEntry := func(s roachpb.Span, wall int64) *btreeFrontierEntry {
e := &btreeFrontierEntry{Start: s.Key, End: s.EndKey, ts: hlc.Timestamp{WallTime: wall}}
return e
}
eAB1 := mkFrontierEntry(spAB, 1)
eBC1 := mkFrontierEntry(spBC, 1)
eAB2 := mkFrontierEntry(spAB, 2)
// Push one
heap.Push(&fh, eAB1)
require.Equal(t, eAB1, heap.Pop(&fh))
// Push different spans and times
heap.Push(&fh, eAB1)
heap.Push(&fh, eBC1)
heap.Push(&fh, eAB2)
require.Equal(t, eAB1, heap.Pop(&fh))
require.Equal(t, eBC1, heap.Pop(&fh))
require.Equal(t, eAB2, heap.Pop(&fh))
// Push in a different span order
heap.Push(&fh, eBC1)
heap.Push(&fh, eAB1)
heap.Push(&fh, eAB2)
require.Equal(t, eAB1, heap.Pop(&fh))
require.Equal(t, eBC1, heap.Pop(&fh))
require.Equal(t, eAB2, heap.Pop(&fh))
// Push in a different time order
heap.Push(&fh, eAB2)
heap.Push(&fh, eAB1)
heap.Push(&fh, eBC1)
require.Equal(t, eAB1, heap.Pop(&fh))
require.Equal(t, eBC1, heap.Pop(&fh))
require.Equal(t, eAB2, heap.Pop(&fh))
}
func TestSequentialSpans(t *testing.T) {
defer leaktest.AfterTest(t)()
testutils.RunTrueAndFalse(t, "btree", func(t *testing.T, useBtreeFrontier bool) {
defer EnableBtreeFrontier(useBtreeFrontier)()
f, err := MakeFrontier(roachpb.Span{Key: roachpb.Key("A"), EndKey: roachpb.Key("Z")})
require.NoError(t, err)
var expectedRanges []string
for r := 'A'; r <= 'Z'-1; r++ {
var sp roachpb.Span
sp.Key = append(sp.Key, byte(r))
sp.EndKey = append(sp.EndKey, byte(r+1))
wall := r - 'A' + 1
_, err := f.Forward(sp, hlc.Timestamp{WallTime: int64(wall)})
require.NoError(t, err)
expectedRanges = append(expectedRanges, fmt.Sprintf("%s@%d", sp, wall))
}
require.Equal(t, strings.Join(expectedRanges, " "), entriesStr(f))
})
}
func makeSpan(start, end string) roachpb.Span {
return roachpb.Span{Key: roachpb.Key(start), EndKey: roachpb.Key(end)}
}
func TestSpanEntries(t *testing.T) {
defer leaktest.AfterTest(t)()
advance := func(f Frontier, s roachpb.Span, wall int64) {
_, err := f.Forward(s, hlc.Timestamp{WallTime: wall})
require.NoError(t, err)
}
spanEntries := func(f Frontier, sp roachpb.Span) string {
var buf strings.Builder
f.SpanEntries(sp, func(s roachpb.Span, ts hlc.Timestamp) OpResult {
if buf.Len() != 0 {
buf.WriteString(` `)
}
fmt.Fprintf(&buf, `%s@%d`, s, ts.WallTime)
return ContinueMatch
})
return buf.String()
}
testutils.RunTrueAndFalse(t, "btree", func(t *testing.T, useBtreeFrontier bool) {
defer EnableBtreeFrontier(useBtreeFrontier)()
t.Run("contiguous frontier", func(t *testing.T) {
spAZ := makeSpan("A", "Z")
f, err := MakeFrontier(spAZ)
require.NoError(t, err)
// Nothing overlaps span fully to the left of frontier.
require.Equal(t, ``, spanEntries(f, makeSpan("0", "9")))
// Nothing overlaps span fully to the right of the frontier.
require.Equal(t, ``, spanEntries(f, makeSpan("a", "z")))
// Span overlaps entire frontier.
require.Equal(t, `{A-Z}@0`, spanEntries(f, spAZ))
advance(f, spAZ, 1)
require.Equal(t, `{A-Z}@1`, spanEntries(f, spAZ))
// Span overlaps part of the frontier, with left part outside frontier.
require.Equal(t, `{A-C}@1`, spanEntries(f, makeSpan("0", "C")))
// Span overlaps part of the frontier, with right part outside frontier.
require.Equal(t, `{Q-Z}@1`, spanEntries(f, makeSpan("Q", "c")))
// Span fully inside frontier.
require.Equal(t, `{P-W}@1`, spanEntries(f, makeSpan("P", "W")))
// Advance part of the frontier.
advance(f, makeSpan("C", "E"), 2)
advance(f, makeSpan("H", "M"), 5)
advance(f, makeSpan("N", "Q"), 3)
// Span overlaps various parts of the frontier.
require.Equal(t,
`{A-C}@1 {C-E}@2 {E-H}@1 {H-M}@5 {M-N}@1 {N-P}@3`,
spanEntries(f, makeSpan("3", "P")))
})
t.Run("disjoint frontier", func(t *testing.T) {
spAB := makeSpan("A", "B")
spCE := makeSpan("C", "E")
f, err := MakeFrontier(spAB, spCE)
require.NoError(t, err)
// Nothing overlaps between the two spans in the frontier.
require.Equal(t, ``, spanEntries(f, makeSpan("B", "C")))
// Overlap with only one entry in the frontier
require.Equal(t, `{C-D}@0`, spanEntries(f, makeSpan("B", "D")))
})
})
}
func advanceFrontier(t *testing.T, f Frontier, s roachpb.Span, wall int64) {
t.Helper()
if log.V(1) {
defer func(before string) {
log.Infof(context.Background(), "advanceFrontier %s@%d: before=%s after=%s", s, wall, before, f)
}(f.String())
}
require.NoError(t, forwardWithErrorCheck(f, s, wall))
_, _, err := checkContiguousFrontier(f)
require.NoError(t, err)
}
// TestForwardInvertedSpan is a replay of a failure uncovered by the fuzzLLRBFrontier test.
// It verifies that the frontier behaves as expected when attempting to forward an inverted span.
func TestForwardInvertedSpan(t *testing.T) {
defer leaktest.AfterTest(t)()
spAZ := makeSpan("A", "Z")
testutils.RunTrueAndFalse(t, "btree", func(t *testing.T, useBtreeFrontier bool) {
defer EnableBtreeFrontier(useBtreeFrontier)()
f, err := MakeFrontier(spAZ)
require.NoError(t, err)
advanceFrontier(t, f, makeSpan("AUgymc", "OOyXp"), 1831)
advanceFrontier(t, f, makeSpan("AUgymc", "OOyXp"), 1923)
advanceFrontier(t, f, makeSpan("AUgymc", "OOyXp"), 2009)
advanceFrontier(t, f, makeSpan("AUggymcymc", "OOyXp"), 2009)
advanceFrontier(t, f, makeSpan("pOyXOmcymc", "pOyXO"), 2009) // NB: inverted span.
advanceFrontier(t, f, makeSpan("a94", "a948"), 1865)
advanceFrontier(t, f, makeSpan("a94", "a948"), 1865)
advanceFrontier(t, f, makeSpan("03hO2Z", "RJRxCy"), 1864)
advanceFrontier(t, f, makeSpan("03hO2Z", "RJRxCy"), 1864)
advanceFrontier(t, f, makeSpan("03", "RJRxCy"), 1864)
advanceFrontier(t, f, makeSpan("0", "RJRxCy"), 1864)
advanceFrontier(t, f, makeSpan("0", "RJRxCy"), 1864)
advanceFrontier(t, f, makeSpan("0", "RJRxCy"), 1864)
advanceFrontier(t, f, makeSpan("0", "RJ"), 1864)
advanceFrontier(t, f, makeSpan("0", "R"), 1864)
advanceFrontier(t, f, makeSpan("0", "0"), 1864)
})
}
func TestForwardToSameTimestamp(t *testing.T) {
defer EnableBtreeFrontier(true)() // LLRB frontier fails this test
spAZ := makeSpan("A", "Z")
f, err := MakeFrontier(spAZ)
require.NoError(t, err)
advanceFrontier(t, f, makeSpan("Axj", "L"), 0)
// Frontier should remain the same since we forwarded a subspan
// to the same timestamp.
require.Equal(t, "[{A-Z}@0,0]", f.String())
}
func TestFrontierImplementationsMatch(t *testing.T) {
rng, seed := randutil.NewPseudoRand()
t.Logf("seed: %d", seed)
mkSpan := func(key, end int) roachpb.Span {
return roachpb.Span{
Key: encoding.EncodeVarintAscending(nil, int64(key)),
EndKey: encoding.EncodeVarintAscending(nil, int64(end)),
}
}
start, total := 100, 1000
totalSpan := mkSpan(start, start+total)
for run := 1; run <= 10; run++ {
l := newLLRBFrontier()
b := newBtreeFrontier()
require.NoError(t, l.AddSpansAt(hlc.Timestamp{}, totalSpan))
require.NoError(t, b.AddSpansAt(hlc.Timestamp{}, totalSpan))
for i := 0; i < 100000; i++ {
k := start + rng.Intn(total)
sp := mkSpan(k, k+1+rng.Intn(3))
ts := hlc.Timestamp{WallTime: int64(rng.Intn(20 * run))}
lFwd, err := l.Forward(sp, ts)
if err != nil {
t.Fatal(err)
}
bFwd, err := b.Forward(sp, ts)
if err != nil {
t.Fatal(err)
}
if lFwd != bFwd {
t.Fatalf("%v != %v (run %d, i %d)", lFwd, bFwd, run, i)
}
if lF, bF := l.Frontier(), b.Frontier(); lF != bF {
t.Fatalf("%v != %v (run %d, i %d)", lF, bF, run, i)
}
}
t.Logf("%s vs %s", l.Frontier(), b.Frontier())
}
}
func TestAddOverlappingSpans(t *testing.T) {
defer leaktest.AfterTest(t)()
ts := func(wall int64) hlc.Timestamp {
return hlc.Timestamp{WallTime: wall}
}
testutils.RunTrueAndFalse(t, "btree", func(t *testing.T, useBtreeFrontier bool) {
defer EnableBtreeFrontier(useBtreeFrontier)()
f, err := MakeFrontier()
require.NoError(t, err)
for r := 'A'; r < 'Z'; r++ {
require.NoError(t, f.AddSpansAt(ts(int64(r-'A'+1)), makeSpan(string(r), string(r+'a'-'A'))))
}
require.NoError(t, f.AddSpansAt(ts(42), makeSpan("A", "z")))
require.Equal(t, hlc.Timestamp{WallTime: 42}, f.Frontier(), "f=%s", f)
})
}
func TestBtreeFrontierMergesSpansDuringInitialization(t *testing.T) {
ts := func(wall int64) hlc.Timestamp {
return hlc.Timestamp{WallTime: wall}
}
testutils.RunTrueAndFalse(t, "btree", func(t *testing.T, useBtreeFrontier bool) {
defer EnableBtreeFrontier(useBtreeFrontier)()
f, err := MakeFrontier()
require.NoError(t, err)
require.NoError(t, f.AddSpansAt(ts(8), makeSpan("A", "C")))
require.NoError(t, f.AddSpansAt(ts(10), makeSpan("B", "D")))
require.NoError(t, f.AddSpansAt(ts(9), makeSpan("C", "Z")))
start, end, err := checkContiguousFrontier(f)
require.NoError(t, err)
require.Equal(t, []byte{'A'}, start, f)
require.Equal(t, []byte{'Z'}, end, f)
require.Equal(t, "{A-B}@8 {B-D}@10 {D-Z}@9", entriesStr(f))
})
}
// Regression for #115411
func TestForwardDeepNestedFrontierEntry(t *testing.T) {
defer leaktest.AfterTest(t)()
ts := func(wall int) hlc.Timestamp {
return hlc.Timestamp{WallTime: int64(wall)}
}
testutils.RunTrueAndFalse(t, "btree", func(t *testing.T, useBtreeFrontier bool) {
defer EnableBtreeFrontier(useBtreeFrontier)()
f, err := MakeFrontier()
require.NoError(t, err)
require.NoError(t, f.AddSpansAt(ts(10), makeSpan("B", "C")))
// Add a bunch of ranges inside [B-C) range.
// We want to add more than 32 such ranges to make sure that
// the underlying b-tree node (if using btree frontier) gets some "children"
// nodes created.
bStart := "B"
for i := 0; i < 64; i++ {
bEnd := "B" + strings.Repeat("b", i+1)
require.NoError(t, f.AddSpansAt(ts(i+10), makeSpan(bStart, bEnd)))
_, _, err := checkContiguousFrontier(f)
require.NoError(t, err, f)
bStart = bEnd
}
advanceFrontier(t, f, makeSpan("A", "Z"), 100)
require.Equal(t, "{B-C}@100", entriesStr(f))
})
}
func BenchmarkFrontier(b *testing.B) {
disableSanityChecksForBenchmark = true
defer func() {
disableSanityChecksForBenchmark = false
}()
b.StopTimer()
// To produce repeatable runs, run the benchmark with the COCKROACH_RANDOM_SEED env var to
// override the seed value, and -test.benchtime=Nx to set an explicit number of iterations.
rnd, rndSeed := randutil.NewPseudoRand()
spanMaker, initialSpan := newSpanMaker(4, rnd)
const corpusSize = 2 << 14
corpus := make([]roachpb.Span, corpusSize)
for i := 0; i < corpusSize; i++ {
corpus[i] = spanMaker.rndSpan()
}
b.StartTimer()
benchForward := func(b *testing.B, f Frontier, rnd *rand.Rand) {
if log.V(1) {
log.Infof(context.Background(), "N=%d NumEntries=%d (seed: %d)", b.N, f.Len(), rndSeed)
}
var wall int64 = 10
for i := 0; i < b.N; i++ {
if i%10 == 0 {
wall += rnd.Int63n(10)
}
delta := rnd.Int63n(10) - rnd.Int63n(3)
rndSpan := corpus[rnd.Intn(corpusSize)]
if _, err := f.Forward(rndSpan, hlc.Timestamp{WallTime: wall + delta}); err != nil {
b.Fatalf("%+v", err)
}
}
if log.V(1) {
log.Infof(context.Background(), "Final frontier has %d entries", f.Len())
}
}
for _, enableBtree := range []bool{false, true} {
b.Run(fmt.Sprintf("btree=%t/rnd", enableBtree), func(b *testing.B) {
defer EnableBtreeFrontier(enableBtree)()
b.StopTimer()
// Reset rnd so that we get the same inputs for both benchmarks.
rnd := rand.New(rand.NewSource(rndSeed))
f, err := MakeFrontier(initialSpan)
if err != nil {
b.Fatal(err)
}
b.StartTimer()
b.ReportAllocs()
benchForward(b, f, rnd)
})
// Bench a case where frontier tracks multiple disjoint spans.
for _, numRanges := range []int{128, 1024, 4096, 8192, 16384} {
b.Run(fmt.Sprintf("btree=%t/r=%d", enableBtree, numRanges), func(b *testing.B) {
defer EnableBtreeFrontier(enableBtree)()
b.StopTimer()
// Reset rnd so that we get the same inputs for both benchmarks.
rnd := rand.New(rand.NewSource(rndSeed))
f, err := MakeFrontier()
if err != nil {
b.Fatal(err)
}
var sg roachpb.SpanGroup
for sg.Len() < numRanges {
startKey := roachpb.Key(spanMaker.rndKey())
sg.Add(roachpb.Span{Key: startKey, EndKey: startKey.Next()})
}
if err := sg.ForEach(func(span roachpb.Span) error {
return f.AddSpansAt(hlc.Timestamp{}, span)
}); err != nil {
b.Fatal(err)
}
b.StartTimer()
b.ReportAllocs()
benchForward(b, f, rnd)
})
}
}
}
// Copyright 2018 The Cockroach Authors.
//
// Use of this software is governed by the CockroachDB Software License
// included in the /LICENSE file.
package span
import (
"container/heap"
"fmt"
"strings"
// Needed for roachpb.Span.String().
_ "github.com/cockroachdb/cockroach/pkg/keys"
"github.com/cockroachdb/cockroach/pkg/roachpb"
"github.com/cockroachdb/cockroach/pkg/util/hlc"
"github.com/cockroachdb/cockroach/pkg/util/interval"
)
// llrbFrontier is the legacy span Frontier implementation, backed by an LLRB interval tree.
// llrbFrontierEntry represents a timestamped span. It is used as the node type in both
// the interval tree and the heap that back the llrbFrontier.
type llrbFrontierEntry struct {
id int64
keys interval.Range
span roachpb.Span
ts hlc.Timestamp
// The index of the item in the llrbFrontierHeap, maintained by the
// heap.Interface methods.
index int
}
// ID implements interval.Interface.
func (s *llrbFrontierEntry) ID() uintptr {
return uintptr(s.id)
}
// Range implements interval.Interface.
func (s *llrbFrontierEntry) Range() interval.Range {
return s.keys
}
func (s *llrbFrontierEntry) String() string {
return fmt.Sprintf("[%s @ %s]", s.span, s.ts)
}
// llrbFrontierHeap implements heap.Interface and holds `llrbFrontierEntry`s. Entries
// are sorted based on their timestamp such that the oldest will rise to the top
// of the heap.
type llrbFrontierHeap []*llrbFrontierEntry
// Len implements heap.Interface.
func (h llrbFrontierHeap) Len() int { return len(h) }
// Less implements heap.Interface.
func (h llrbFrontierHeap) Less(i, j int) bool {
if h[i].ts == h[j].ts {
return h[i].span.Key.Compare(h[j].span.Key) < 0
}
return h[i].ts.Less(h[j].ts)
}
// Swap implements heap.Interface.
func (h llrbFrontierHeap) Swap(i, j int) {
h[i], h[j] = h[j], h[i]
h[i].index, h[j].index = i, j
}
// Push implements heap.Interface.
func (h *llrbFrontierHeap) Push(x interface{}) {
n := len(*h)
entry := x.(*llrbFrontierEntry)
entry.index = n
*h = append(*h, entry)
}
// Pop implements heap.Interface.
func (h *llrbFrontierHeap) Pop() interface{} {
old := *h
n := len(old)
entry := old[n-1]
entry.index = -1 // for safety
old[n-1] = nil // for gc
*h = old[0 : n-1]
return entry
}
// llrbFrontier tracks the minimum timestamp of a set of spans.
type llrbFrontier struct {
// tree contains `*llrbFrontierEntry` items for the entire current tracked
// span set. Any tracked spans that have never been `Forward`ed will have a
// zero timestamp. If any entries needed to be split along a tracking
// boundary, this has already been done by `insert` before it entered the
// tree.
tree interval.Tree
// minHeap contains the same `*llrbFrontierEntry` items as `tree`. Entries
// in the heap are sorted first by minimum timestamp and then by lesser
// start key.
minHeap llrbFrontierHeap
idAlloc int64
}
// copyRangeToSpan copies the interval's start/end points and returns them as a span.
// Whenever we store user-provided span objects inside frontier
// data structures, we must make a copy lest the user later mutate the
// underlying start/end []byte slices in the range.
func copyRangeToSpan(r interval.Range) (res roachpb.Span) {
res.Key = append(res.Key, r.Start...)
res.EndKey = append(res.EndKey, r.End...)
return
}
// Release implements Frontier interface.
func (f *llrbFrontier) Release() {}
// AddSpansAt adds the provided spans to the llrbFrontier at the provided timestamp.
func (f *llrbFrontier) AddSpansAt(startAt hlc.Timestamp, spans ...roachpb.Span) error {
for _, toAdd := range spans {
toAdd = copyRangeToSpan(toAdd.AsRange())
// Add toAdd sub-spans that do not overlap this frontier. To ensure that
// adjacent spans are merged, sub-spans are added in two steps: first,
// non-overlapping spans are added with 0 timestamp; then the timestamp for
// the entire toAdd span is forwarded.
for _, span := range spanDifference(toAdd, f) {
e := &llrbFrontierEntry{
id: f.idAlloc,
keys: span.AsRange(),
span: span,
ts: hlc.Timestamp{},
}
f.idAlloc++
if err := f.tree.Insert(e, false /* fast */); err != nil {
return err
}
heap.Push(&f.minHeap, e)
}
if err := f.insert(toAdd, startAt); err != nil {
return err
}
}
f.tree.AdjustRanges()
return nil
}
// Frontier returns the minimum timestamp being tracked.
func (f *llrbFrontier) Frontier() hlc.Timestamp {
if f.minHeap.Len() == 0 {
return hlc.Timestamp{}
}
return f.minHeap[0].ts
}
// PeekFrontierSpan returns one of the spans at the llrbFrontier.
func (f *llrbFrontier) PeekFrontierSpan() roachpb.Span {
if f.minHeap.Len() == 0 {
return roachpb.Span{}
}
return f.minHeap[0].span
}
// Forward advances the timestamp for a span. Any part of the span that doesn't
// overlap the tracked span set will be ignored. True is returned if the
// frontier advanced as a result.
//
// Note that internally, it may be necessary to use multiple entries to
// represent this timestamped span (e.g. if it overlaps with the tracked span
// set boundary). Similarly, an entry created by a previous Forward may be
// partially overlapped and have to be split into two entries.
func (f *llrbFrontier) Forward(span roachpb.Span, ts hlc.Timestamp) (bool, error) {
prevFrontier := f.Frontier()
if err := f.insert(span, ts); err != nil {
return false, err
}
return prevFrontier.Less(f.Frontier()), nil
}
// extendRangeToTheLeft extends the range to cover adjacent entries to its left,
// provided those entries all have the specified timestamp.
// It updates the provided range with the new starting position.
// It returns the list of llrbFrontierEntries covered by the updated range; the caller
// is expected to remove those covered entries from the tree.
func extendRangeToTheLeft(
t interval.Tree, r *interval.Range, ts hlc.Timestamp,
) (covered []*llrbFrontierEntry) {
for {
// Get the entry to the left of the range.
// Since we request an inclusive overlap of the range containing exactly
// 1 key, we expect to get two entries if there is anything to the left:
// the range (r) itself, and the one to the left of r.
left := t.GetWithOverlapper(
interval.Range{Start: r.Start, End: r.Start},
interval.InclusiveOverlapper,
)
if len(left) == 2 && left[0].(*llrbFrontierEntry).ts.Equal(ts) {
e := left[0].(*llrbFrontierEntry)
covered = append(covered, e)
r.Start = e.keys.Start
} else {
return
}
}
}
// extendRangeToTheRight extends the range to cover adjacent entries to its right,
// provided those entries all have the specified timestamp.
// It updates the provided range with the new ending position.
// It returns the list of llrbFrontierEntries covered by the updated range; the caller
// is expected to remove those covered entries from the tree.
func extendRangeToTheRight(
t interval.Tree, r *interval.Range, ts hlc.Timestamp,
) (covered []*llrbFrontierEntry) {
for {
// Get the entry to the right of the range.
// Since we request an exclusive overlap of the range containing exactly
// 1 key, we expect to get exactly 1 entry if there is anything to the right of the span.
endKey := roachpb.Key(r.End)
rightSpan := roachpb.Span{Key: endKey, EndKey: endKey.Next()}
right := t.GetWithOverlapper(rightSpan.AsRange(), interval.ExclusiveOverlapper)
if len(right) == 1 && right[0].(*llrbFrontierEntry).ts.Equal(ts) {
e := right[0].(*llrbFrontierEntry)
covered = append(covered, e)
r.End = e.keys.End
} else {
return
}
}
}
func (f *llrbFrontier) insert(span roachpb.Span, insertTS hlc.Timestamp) error {
if err := checkSpan(span); err != nil {
return err
}
// Set of llrbFrontierEntries to add and remove.
var toAdd, toRemove []*llrbFrontierEntry
// addEntry adds llrbFrontierEntry to the toAdd list.
addEntry := func(r interval.Range, ts hlc.Timestamp) {
sp := copyRangeToSpan(r)
toAdd = append(toAdd, &llrbFrontierEntry{
id: f.idAlloc,
span: sp,
keys: sp.AsRange(),
ts: ts,
})
f.idAlloc++
}
// todoRange is the range we're adding. It gets updated as we process the range.
todoRange := span.AsRange()
// pendingSpan (if not empty) is the span of multiple overlap intervals
// we'll merge together (because all of those intervals have timestamp lower
// than insertTS).
var pendingSpan interval.Range
// consumePrefix consumes todoRange prefix ending at 'end' and moves
// that prefix into pendingSpan.
consumePrefix := func(end interval.Comparable) {
if pendingSpan.Start == nil {
pendingSpan.Start = todoRange.Start
}
todoRange.Start = end
pendingSpan.End = end
}
extendLeft := true // can the merged span be extended to the left?
// addPending adds llrbFrontierEntry for the pendingSpan if it's non-empty, and resets it.
addPending := func() {
if !pendingSpan.Start.Equal(pendingSpan.End) {
if extendLeft {
toRemove = append(toRemove, extendRangeToTheLeft(f.tree, &pendingSpan, insertTS)...)
}
addEntry(pendingSpan, insertTS)
}
pendingSpan.Start = nil
pendingSpan.End = nil
extendLeft = true
}
// Main work: start iterating through all ranges that overlap our span.
f.tree.DoMatching(func(k interval.Interface) (done bool) {
overlap := k.(*llrbFrontierEntry)
// If overlap does not start immediately after our pendingSpan,
// then add and reset pending.
if !overlap.span.Key.Equal(roachpb.Key(pendingSpan.End)) {
addPending()
}
// Trim todoRange if it falls outside the span(s) tracked by this frontier.
// This establishes the invariant that overlap start must be at or before todoRange start.
if todoRange.Start.Compare(overlap.keys.Start) < 0 {
todoRange.Start = overlap.keys.Start
}
// Fast case: we already recorded a higher timestamp for this overlap.
if insertTS.Less(overlap.ts) {
todoRange.Start = overlap.keys.End
return ContinueMatch.asBool()
}
// At this point, we know that the overlap timestamp is not ahead of insertTS
// (otherwise we'd have hit the fast case above).
// We need to split the overlap range, so mark overlap for removal.
toRemove = append(toRemove, overlap)
// We need to split overlap range into multiple parts.
// 1. Possibly empty part before todoRange.Start
if overlap.keys.Start.Compare(todoRange.Start) < 0 {
extendLeft = false
addEntry(interval.Range{Start: overlap.keys.Start, End: todoRange.Start}, overlap.ts)
}
// 2. Middle part (with updated timestamp), and...
// 3. Possibly empty part after todoRange end.
if cmp := todoRange.End.Compare(overlap.keys.End); cmp <= 0 {
// Our todoRange ends before the overlap ends, so consume all of it.
consumePrefix(todoRange.End)
if cmp < 0 && overlap.ts != insertTS {
// Add the rest of the overlap.
addEntry(interval.Range{Start: todoRange.End, End: overlap.keys.End}, overlap.ts)
} else {
// We can consume all the way until the end of the overlap
// since overlap extends to the end of todoRange or it has the same timestamp as insertTS.
consumePrefix(overlap.keys.End)
// We can also attempt to merge more ranges with the same timestamp to the right
// of overlap. Extending range to the right adjusts pendingSpan.End and returns the
// list of extended ranges, which we remove because they are subsumed by pendingSpan.
// Note also that at this point we know that this is the last overlap entry, and that
// we will exit DoMatching, at which point we add whatever range was accumulated
// in pendingSpan.
toRemove = append(toRemove, extendRangeToTheRight(f.tree, &pendingSpan, insertTS)...)
}
} else {
// Our todoRange extends beyond overlap: consume until the end of the overlap.
consumePrefix(overlap.keys.End)
}
return ContinueMatch.asBool()
}, span.AsRange())
// Add remaining pending range.
addPending()
const withRangeAdjust = false
for _, e := range toRemove {
if err := f.tree.Delete(e, withRangeAdjust); err != nil {
return err
}
heap.Remove(&f.minHeap, e.index)
}
for _, e := range toAdd {
if err := f.tree.Insert(e, withRangeAdjust); err != nil {
return err
}
heap.Push(&f.minHeap, e)
}
return nil
}
// Entries invokes the given callback with the current timestamp for each
// component span in the tracked span set.
func (f *llrbFrontier) Entries(fn Operation) {
f.tree.Do(func(i interval.Interface) bool {
spe := i.(*llrbFrontierEntry)
return fn(spe.span, spe.ts).asBool()
})
}
// SpanEntries invokes op for each sub-span of the specified span with the
// timestamp as observed by this frontier.
//
// Time
// 5| .b__c .
// 4| . h__k .
// 3| . e__f .
// 1 ---a----------------------m---q-- llrbFrontier
//
// |___________span___________|
//
// In the above example, frontier tracks [b, m) and the current frontier
// timestamp is 1. SpanEntries for span [a-q) will invoke op with:
//
// ([b-c), 5), ([c-e), 1), ([e-f), 3), ([f-h), 1), ([h-k), 4), ([k-m), 1).
//
// Note: neither [a-b) nor [m-q) will be emitted since they do not intersect with the spans
// tracked by this frontier.
func (f *llrbFrontier) SpanEntries(span roachpb.Span, op Operation) {
todoRange := span.AsRange()
f.tree.DoMatching(func(i interval.Interface) bool {
e := i.(*llrbFrontierEntry)
// Skip untracked portion.
if todoRange.Start.Compare(e.keys.Start) < 0 {
todoRange.Start = e.keys.Start
}
end := e.keys.End
if e.keys.End.Compare(todoRange.End) > 0 {
end = todoRange.End
}
if op(roachpb.Span{Key: roachpb.Key(todoRange.Start), EndKey: roachpb.Key(end)}, e.ts) == StopMatch {
return StopMatch.asBool()
}
todoRange.Start = end
return ContinueMatch.asBool()
}, span.AsRange())
}
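// The function below is an illustrative sketch (a hypothetical helper, not
// part of this package's API). It mirrors the worked example above: for a
// query span wider than the tracked span set, op is invoked only for the
// overlapping sub-spans, each with the timestamp currently recorded by the
// frontier, and untracked portions are skipped.
func exampleSpanEntries(f Frontier) []string {
	var out []string
	f.SpanEntries(roachpb.Span{Key: roachpb.Key("a"), EndKey: roachpb.Key("q")},
		func(s roachpb.Span, ts hlc.Timestamp) OpResult {
			out = append(out, fmt.Sprintf("%s@%d", s, ts.WallTime))
			return ContinueMatch
		})
	return out
}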
// String implements Stringer.
func (f *llrbFrontier) String() string {
var buf strings.Builder
f.tree.Do(func(i interval.Interface) bool {
if buf.Len() != 0 {
buf.WriteString(` `)
}
buf.WriteString(i.(*llrbFrontierEntry).String())
return false
})
return buf.String()
}
// Len implements Frontier.
func (f *llrbFrontier) Len() int {
return f.tree.Len()
}
// Copyright 2024 The Cockroach Authors.
//
// Use of this software is governed by the CockroachDB Software License
// included in the /LICENSE file.
package span
import (
"fmt"
"math/rand"
"strings"
"github.com/cockroachdb/cockroach/pkg/roachpb"
"github.com/cockroachdb/cockroach/pkg/util/hlc"
"github.com/cockroachdb/cockroach/pkg/util/interval"
"github.com/cockroachdb/errors"
)
func checkContiguousFrontier(f Frontier) (startKey, endKey []byte, retErr error) {
// Iterate frontier to make sure it is sane.
prev := struct {
s roachpb.Span
ts hlc.Timestamp
}{}
frontierSpan := f.PeekFrontierSpan()
frontierTS := f.Frontier()
sawFrontierSpan := false
f.Entries(func(s roachpb.Span, ts hlc.Timestamp) (done OpResult) {
if s.Equal(frontierSpan) && ts.Equal(frontierTS) {
sawFrontierSpan = true
}
if prev.s.Key == nil && prev.s.EndKey == nil {
prev.s = s
prev.ts = ts
startKey = s.Key
endKey = s.EndKey
return ContinueMatch
}
if s.Key.Equal(prev.s.EndKey) {
// Contiguous spans with the same timestamp are expected to be merged.
// However, the LLRB-based frontier has some gaps in its merge logic, so just
// let it be.
if useBtreeFrontier && ts.Equal(prev.ts) {
retErr = errors.Newf("expected ranges with equal timestamp to be merged, found %s and %s: %s", prev.s, s, f)
return StopMatch
}
} else {
// We expect frontier entries to be contiguous.
retErr = errors.Newf("expected contiguous entries, found gap between %s and %s: %s", prev.s, s, f)
return StopMatch
}
endKey = s.EndKey
prev.s = s
prev.ts = ts
return ContinueMatch
})
if !sawFrontierSpan {
return startKey, endKey, errors.Newf("expected to find frontier span %s@%s: %s", frontierSpan, frontierTS, f)
}
return startKey, endKey, retErr
}
// forwardWithErrorCheck forwards the span's timestamp.
// It verifies that any returned error is consistent with the input span.
func forwardWithErrorCheck(f Frontier, s roachpb.Span, wall int64) error {
if _, err := f.Forward(s, hlc.Timestamp{WallTime: wall}); err != nil {
switch s.Key.Compare(s.EndKey) {
case 1:
if !errors.Is(err, interval.ErrInvertedRange) {
return errors.Wrapf(err, "expected inverted span error for span %s", s)
}
case 0:
if len(s.Key) == 0 && len(s.EndKey) == 0 {
if !errors.Is(err, interval.ErrNilRange) {
return errors.Wrapf(err, "expected nil range error for span %s", s)
}
} else if !errors.Is(err, interval.ErrEmptyRange) {
return errors.Wrapf(err, "expected empty span error for span %s", s)
}
default:
return errors.Wrapf(err, "f=%s", f)
}
}
return nil
}
// symbols that can make up spans.
var spanSymbols = []byte("@$0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ")
type spanMaker struct {
rnd *rand.Rand
numSymbols int
starts []interval.Comparable
}
func newSpanMaker(numSymbols int, rnd *rand.Rand) (*spanMaker, roachpb.Span) {
m := &spanMaker{
rnd: rnd,
numSymbols: numSymbols,
}
span := roachpb.Span{
Key: roachpb.Key{'A'},
EndKey: roachpb.Key{'z'},
}
return m, span
}
func (m *spanMaker) rndKey() interval.Comparable {
var key []byte
for n := 1 + m.rnd.Intn(m.numSymbols); n > 0; n-- {
key = append(key, spanSymbols[m.rnd.Intn(len(spanSymbols))])
}
return key
}
func (m *spanMaker) rndSpan() roachpb.Span {
var startKey interval.Comparable
if len(m.starts) > 0 && m.rnd.Int()%37 == 0 {
// With some probability use previous starting point.
startKey = append(startKey, m.starts[m.rnd.Intn(len(m.starts))]...)
// Just for fun, nudge start a bit forward or back.
if dice := m.rnd.Intn(3) - 1; dice != 0 {
startKey[len(startKey)-1] += byte(dice)
}
} else {
// Generate a new start.
startKey = m.rndKey()
m.starts = append(m.starts, startKey)
}
endKey := m.rndKey()
// With some probability, make startKey prefix of endKey.
if m.rnd.Int()%97 == 0 {
endKey = append(startKey, endKey...)
}
if startKey.Equal(endKey) {
endKey = append(endKey, spanSymbols[m.rnd.Intn(len(spanSymbols))])
}
if endKey.Compare(startKey) < 0 {
startKey, endKey = endKey, startKey
}
if endKey.Equal(startKey) {
panic(roachpb.Span{Key: roachpb.Key(startKey), EndKey: roachpb.Key(endKey)}.String())
}
return roachpb.Span{Key: roachpb.Key(startKey), EndKey: roachpb.Key(endKey)}
}
const maxHistory = 64
type SpanFrontier = Frontier
// captureHistoryFrontier is a Frontier that captures the history
// of Forward calls in order to make it easier to reproduce fuzz test failures.
// See TestForwardInvertedSpan.
type captureHistoryFrontier struct {
SpanFrontier
history []string
}
func (f *captureHistoryFrontier) Forward(span roachpb.Span, ts hlc.Timestamp) (bool, error) {
f.history = append(f.history,
fmt.Sprintf(`advanceFrontier(t, f, makeSpan(%q, %q), %d)`, span.Key, span.EndKey, ts.WallTime))
if len(f.history) > maxHistory {
f.history = append([]string{}, f.history[1:]...)
}
return f.SpanFrontier.Forward(span, ts)
}
func (f *captureHistoryFrontier) History() string {
return strings.Join(f.history, "\n")
}
// Copyright 2016 The Cockroach Authors.
//
// Use of this software is governed by the CockroachDB Software License
// included in the /LICENSE file.
package util
import (
"bytes"
"fmt"
io "io"
"math/rand"
"strings"
"text/tabwriter"
"unicode/utf8"
"github.com/cockroachdb/errors"
"github.com/cockroachdb/redact"
)
// GetSingleRune decodes the string s as a single rune if possible.
func GetSingleRune(s string) (rune, error) {
if s == "" {
return 0, nil
}
r, sz := utf8.DecodeRuneInString(s)
if r == utf8.RuneError {
return 0, errors.Errorf("invalid character: %s", s)
}
if sz != len(s) {
return r, errors.New("must be only one character")
}
return r, nil
}
// ToLowerSingleByte returns the lowercase version of a given single ASCII byte.
// Any byte that is not an ASCII uppercase letter is returned unchanged.
func ToLowerSingleByte(b byte) byte {
if b >= 'A' && b <= 'Z' {
return 'a' + (b - 'A')
}
return b
}
// TruncateString truncates a string to a given number of runes.
func TruncateString(s string, maxRunes int) string {
// This is a fast path (len(s) is an upper bound for RuneCountInString).
if len(s) <= maxRunes {
return s
}
n := utf8.RuneCountInString(s)
if n <= maxRunes {
return s
}
// Fast path for ASCII strings.
if len(s) == n {
return s[:maxRunes]
}
i := 0
for pos := range s {
if i == maxRunes {
return s[:pos]
}
i++
}
// This code should be unreachable.
return s
}
// RemoveTrailingSpaces splits the input string into lines, trims any trailing
// spaces from each line, then puts the lines back together.
//
// Any newlines at the end of the input string are ignored.
//
// The output string always ends in a newline.
func RemoveTrailingSpaces(input string) string {
lines := strings.TrimRight(input, "\n")
var buf bytes.Buffer
for _, line := range strings.Split(lines, "\n") {
fmt.Fprintf(&buf, "%s\n", strings.TrimRight(line, " "))
}
return buf.String()
}
// StringListBuilder helps print out lists of items. See
// MakeStringListBuilder.
type StringListBuilder struct {
begin, separator, end string
// started is true if we had at least one entry (and thus wrote out <begin>).
started bool
}
// MakeStringListBuilder creates a StringListBuilder, which is used to print out
// lists of items. Sample usage:
//
// b := MakeStringListBuilder("(", ", ", ")")
// b.Add(&buf, "x")
// b.Add(&buf, "y")
// b.Finish(&buf) // By now, we wrote "(x, y)".
//
// If Add is not called, nothing is written.
func MakeStringListBuilder(begin, separator, end string) StringListBuilder {
return StringListBuilder{
begin: begin,
separator: separator,
end: end,
started: false,
}
}
func (b *StringListBuilder) prepareToAdd(w io.Writer) {
if b.started {
_, _ = w.Write([]byte(b.separator))
} else {
_, _ = w.Write([]byte(b.begin))
b.started = true
}
}
// Add an item to the list.
func (b *StringListBuilder) Add(w io.Writer, val string) {
b.prepareToAdd(w)
_, _ = w.Write([]byte(val))
}
// Addf is a format variant of Add.
func (b *StringListBuilder) Addf(w io.Writer, format string, args ...interface{}) {
b.prepareToAdd(w)
fmt.Fprintf(w, format, args...)
}
// Finish must be called after all the elements have been added.
func (b *StringListBuilder) Finish(w io.Writer) {
if b.started {
_, _ = w.Write([]byte(b.end))
}
}
// ExpandTabsInRedactableBytes expands tabs in the redactable byte
// slice, so that columns are aligned. The correctness of this
// function depends on the assumption that the `tabwriter` does not
// replace characters.
func ExpandTabsInRedactableBytes(s redact.RedactableBytes) (redact.RedactableBytes, error) {
var buf bytes.Buffer
tw := tabwriter.NewWriter(&buf, 2, 1, 2, ' ', 0)
if _, err := tw.Write([]byte(s)); err != nil {
return nil, err
}
if err := tw.Flush(); err != nil {
return nil, err
}
return redact.RedactableBytes(buf.Bytes()), nil
}
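// The function below is an illustrative sketch (a hypothetical helper with a
// made-up input, not part of this package's API). It shows how tab-separated
// redactable columns come out aligned after expansion.
func exampleExpandTabs() (redact.RedactableBytes, error) {
	in := redact.RedactableBytes("name\tcount\nreplica\t3\n")
	// After expansion, "name"/"replica" and "count"/"3" are padded so the
	// columns line up; the tabwriter only inserts spaces and does not replace
	// any existing characters (including redaction markers).
	return ExpandTabsInRedactableBytes(in)
}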
// RandString generates a random string of the desired length from the
// input alphabet.
func RandString(rng *rand.Rand, length int, alphabet string) string {
buf := make([]byte, length)
for i := range buf {
buf[i] = alphabet[rng.Intn(len(alphabet))]
}
return string(buf)
}
// Copyright 2017 The Cockroach Authors.
//
// Use of this software is governed by the CockroachDB Software License
// included in the /LICENSE file.
package syncutil
import (
"math"
"sync/atomic"
)
// AtomicFloat64 mimics the atomic types in the sync/atomic standard library,
// but for the float64 type. If you'd like to implement additional methods,
// consider checking out the expvar Float type for guidance:
// https://golang.org/src/expvar/expvar.go?s=2188:2222#L69
type AtomicFloat64 struct {
val atomic.Uint64
}
// Store atomically stores a float64 value.
func (f *AtomicFloat64) Store(val float64) {
f.val.Store(math.Float64bits(val))
}
// Load atomically loads the float64 value.
func (f *AtomicFloat64) Load() float64 {
return math.Float64frombits(f.val.Load())
}
// Add atomically adds delta to the float64 value and returns the new value.
func (f *AtomicFloat64) Add(delta float64) (new float64) {
for {
oldInt := f.val.Load()
oldFloat := math.Float64frombits(oldInt)
newFloat := oldFloat + delta
newInt := math.Float64bits(newFloat)
if f.val.CompareAndSwap(oldInt, newInt) {
return newFloat
}
}
}
// StoreIfHigher atomically stores the given value if it is higher than the
// current value (in which case the given value is returned; otherwise the
// existing value is returned).
func (f *AtomicFloat64) StoreIfHigher(new float64) (val float64) {
newInt := math.Float64bits(new)
for {
oldInt := f.val.Load()
oldFloat := math.Float64frombits(oldInt)
if oldFloat > new {
return oldFloat
}
if f.val.CompareAndSwap(oldInt, newInt) {
return new
}
}
}
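// The function below is an illustrative sketch (a hypothetical helper, not
// part of this package's API) showing typical use of AtomicFloat64: Add for a
// lock-free running sum and StoreIfHigher for tracking a maximum. Both are
// safe to call from many goroutines concurrently.
func exampleAtomicFloat64(samples []float64) (sum, max float64) {
	var total, peak AtomicFloat64
	for _, v := range samples {
		total.Add(v)          // CAS retry loop; never loses a concurrent update
		peak.StoreIfHigher(v) // keeps the largest value observed so far
	}
	return total.Load(), peak.Load()
}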
// AtomicString gives you atomic-style APIs for string.
type AtomicString struct {
s atomic.Value
}
// Set atomically stores val as the new value.
func (s *AtomicString) Set(val string) {
s.s.Store(val)
}
// Get atomically returns the current value.
func (s *AtomicString) Get() string {
val := s.s.Load()
if val == nil {
return ""
}
return val.(string)
}
// Copyright 2019 The Cockroach Authors.
//
// Use of this software is governed by the CockroachDB Software License
// included in the /LICENSE file.
// Copyright 2016 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in licenses/BSD-golang.txt.
// This code originated in Go's sync package.
package syncutil
import (
"sync/atomic"
"unsafe"
)
// Map is like a Go map[K]*V but is safe for concurrent use by multiple
// goroutines without additional locking or coordination.
// Loads, stores, and deletes run in amortized constant time.
//
// The Map type is specialized. Most code should use a plain Go map instead,
// with separate locking or coordination, for better type safety and to make it
// easier to maintain other invariants along with the map content.
//
// The Map type is optimized for two common use cases: (1) when the entry for a
// given key is only ever written once but read many times, as in caches that
// only grow, or (2) when multiple goroutines read, write, and overwrite entries
// for disjoint sets of keys. In these two cases, use of a Map may
// significantly reduce lock contention compared to a Go map paired with a
// separate Mutex or RWMutex.
//
// Nil values are not supported; to use a Map as a set, store a dummy non-nil
// pointer instead of nil. The Set type is provided for this purpose.
//
// The zero Map is valid and empty.
//
// A Map must not be copied after first use.
//
// In the terminology of the Go memory model, Map arranges that a write
// operation “synchronizes before” any read operation that observes the effect
// of the write, where read and write operations are defined as follows.
// Load, LoadAndDelete, LoadOrStore are read operations;
// Delete, LoadAndDelete, and Store are write operations;
// and LoadOrStore is a write operation when it returns loaded set to false.
type Map[K comparable, V any] struct {
mu Mutex
// read contains the portion of the map's contents that are safe for
// concurrent access (with or without mu held).
//
// The read field itself is always safe to load, but must only be stored with
// mu held.
//
// Entries stored in read may be updated concurrently without mu, but updating
// a previously-expunged entry requires that the entry be copied to the dirty
// map and unexpunged with mu held.
read atomic.Pointer[readOnly[K, V]]
// dirty contains the portion of the map's contents that require mu to be
// held. To ensure that the dirty map can be promoted to the read map quickly,
// it also includes all of the non-expunged entries in the read map.
//
// Expunged entries are not stored in the dirty map. An expunged entry in the
// clean map must be unexpunged and added to the dirty map before a new value
// can be stored to it.
//
// If the dirty map is nil, the next write to the map will initialize it by
// making a shallow copy of the clean map, omitting stale entries.
dirty map[K]*entry[V]
// misses counts the number of loads since the read map was last updated that
// needed to lock mu to determine whether the key was present.
//
// Once enough misses have occurred to cover the cost of copying the dirty
// map, the dirty map will be promoted to the read map (in the unamended
// state) and the next store to the map will make a new dirty copy.
misses int
}
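// The function below is an illustrative sketch (a hypothetical helper, not
// part of this package's API) of the Map type in its intended
// write-rarely/read-often mode: values are pointers, LoadOrStore installs the
// first writer's value, and subsequent Loads are lock-free reads.
func exampleMapUsage() {
	var cache Map[string, int]
	v := 42
	// Install a value for "answer" unless one is already present.
	actual, loaded := cache.LoadOrStore("answer", &v)
	_ = actual
	_ = loaded // false on the first call: our value was stored
	// Concurrent readers observe the stored pointer without taking mu.
	if got, ok := cache.Load("answer"); ok {
		_ = got
	}
	// Range visits each live entry; returning false stops the iteration.
	cache.Range(func(key string, value *int) bool {
		return true
	})
}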
// readOnly is an immutable struct stored atomically in the Map.read field.
type readOnly[K comparable, V any] struct {
m map[K]*entry[V]
amended bool // true if the dirty map contains some key not in m.
}
// An entry is a slot in the map corresponding to a particular key.
type entry[V any] struct {
// p points to the value stored for the entry.
//
// If p == nil, the entry has been deleted, and either m.dirty == nil or
// m.dirty[key] is e.
//
// If p == expunged, the entry has been deleted, m.dirty != nil, and the entry
// is missing from m.dirty.
//
// Otherwise, the entry is valid and recorded in m.read.m[key] and, if m.dirty
// != nil, in m.dirty[key].
//
// An entry can be deleted by atomic replacement with nil: when m.dirty is
// next created, it will atomically replace nil with expunged and leave
// m.dirty[key] unset.
//
// An entry's associated value can be updated by atomic replacement, provided
// p != expunged. If p == expunged, an entry's associated value can be updated
// only after first setting m.dirty[key] = e so that lookups using the dirty
// map find the entry.
p atomic.Pointer[V]
}
func newEntry[V any](v *V) *entry[V] {
e := &entry[V]{}
e.p.Store(v)
return e
}
func (m *Map[K, V]) loadReadOnly() readOnly[K, V] {
if p := m.read.Load(); p != nil {
return *p
}
return readOnly[K, V]{}
}
// Load returns the value stored in the map for a key, or nil if no
// value is present.
// The ok result indicates whether value was found in the map.
func (m *Map[K, V]) Load(key K) (value *V, ok bool) {
read := m.loadReadOnly()
e, ok := read.m[key]
if !ok && read.amended {
func() {
m.mu.Lock()
defer m.mu.Unlock()
// Avoid reporting a spurious miss if m.dirty got promoted while we were
// blocked on m.mu. (If further loads of the same key will not miss, it's
// not worth copying the dirty map for this key.)
read = m.loadReadOnly()
e, ok = read.m[key]
if !ok && read.amended {
e, ok = m.dirty[key]
// Regardless of whether the entry was present, record a miss: this key
// will take the slow path until the dirty map is promoted to the read
// map.
m.missLocked()
}
}()
}
if !ok {
return nil, false
}
return e.load()
}
func (e *entry[V]) load() (value *V, ok bool) {
p := e.p.Load()
if p == nil || p == e.expunged() {
return nil, false
}
return p, true
}
// Store sets the value for a key.
func (m *Map[K, V]) Store(key K, value *V) {
m.assertNotNil(value)
read := m.loadReadOnly()
if e, ok := read.m[key]; ok && e.tryStore(value) {
return
}
m.mu.Lock()
defer m.mu.Unlock()
read = m.loadReadOnly()
if e, ok := read.m[key]; ok {
if e.unexpungeLocked() {
// The entry was previously expunged, which implies that there is a
// non-nil dirty map and this entry is not in it.
m.dirty[key] = e
}
e.storeLocked(value)
} else if e, ok := m.dirty[key]; ok {
e.storeLocked(value)
} else {
if !read.amended {
// We're adding the first new key to the dirty map.
// Make sure it is allocated and mark the read-only map as incomplete.
m.dirtyLocked()
m.read.Store(&readOnly[K, V]{m: read.m, amended: true})
}
m.dirty[key] = newEntry(value)
}
}
// assertNotNil asserts that a provided value to store is non-nil. Map does not
// support nil values because nil is used to indicate that an entry has been
// deleted. Callers should use a dummy non-nil pointer instead of nil.
func (*Map[K, V]) assertNotNil(v *V) {
if v == nil {
panic("syncutil.Map: store with a nil value is unsupported")
}
}
// tryStore stores a value if the entry has not been expunged.
//
// If the entry is expunged, tryStore returns false and leaves the entry
// unchanged.
func (e *entry[V]) tryStore(v *V) bool {
for {
p := e.p.Load()
if p == e.expunged() {
return false
}
if e.p.CompareAndSwap(p, v) {
return true
}
}
}
// unexpungeLocked ensures that the entry is not marked as expunged.
//
// If the entry was previously expunged, it must be added to the dirty map
// before m.mu is unlocked.
func (e *entry[V]) unexpungeLocked() (wasExpunged bool) {
return e.p.CompareAndSwap(e.expunged(), nil)
}
// storeLocked unconditionally stores a value to the entry.
//
// The entry must be known not to be expunged.
func (e *entry[V]) storeLocked(v *V) {
e.p.Store(v)
}
// LoadOrStore returns the existing value for the key if present.
// Otherwise, it stores and returns the given value.
// The loaded result is true if the value was loaded, false if stored.
func (m *Map[K, V]) LoadOrStore(key K, value *V) (actual *V, loaded bool) {
m.assertNotNil(value)
// Avoid locking if it's a clean hit.
read := m.loadReadOnly()
if e, ok := read.m[key]; ok {
actual, loaded, ok = e.tryLoadOrStore(value)
if ok {
return actual, loaded
}
}
m.mu.Lock()
defer m.mu.Unlock()
read = m.loadReadOnly()
if e, ok := read.m[key]; ok {
if e.unexpungeLocked() {
m.dirty[key] = e
}
actual, loaded, _ = e.tryLoadOrStore(value)
} else if e, ok := m.dirty[key]; ok {
actual, loaded, _ = e.tryLoadOrStore(value)
m.missLocked()
} else {
if !read.amended {
// We're adding the first new key to the dirty map.
// Make sure it is allocated and mark the read-only map as incomplete.
m.dirtyLocked()
m.read.Store(&readOnly[K, V]{m: read.m, amended: true})
}
m.dirty[key] = newEntry(value)
actual, loaded = value, false
}
return actual, loaded
}
// tryLoadOrStore atomically loads or stores a value if the entry is not
// expunged.
//
// If the entry is expunged, tryLoadOrStore leaves the entry unchanged and
// returns with ok==false.
func (e *entry[V]) tryLoadOrStore(v *V) (actual *V, loaded, ok bool) {
p := e.p.Load()
if p == e.expunged() {
return nil, false, false
}
if p != nil {
return p, true, true
}
for {
if e.p.CompareAndSwap(nil, v) {
return v, false, true
}
p = e.p.Load()
if p == e.expunged() {
return nil, false, false
}
if p != nil {
return p, true, true
}
}
}
// LoadAndDelete deletes the value for a key, returning the previous value if any.
// The loaded result reports whether the key was present.
func (m *Map[K, V]) LoadAndDelete(key K) (value *V, loaded bool) {
read := m.loadReadOnly()
e, ok := read.m[key]
if !ok && read.amended {
func() {
m.mu.Lock()
defer m.mu.Unlock()
read = m.loadReadOnly()
e, ok = read.m[key]
if !ok && read.amended {
e, ok = m.dirty[key]
delete(m.dirty, key)
// Regardless of whether the entry was present, record a miss: this key
// will take the slow path until the dirty map is promoted to the read
// map.
m.missLocked()
}
}()
}
if ok {
return e.delete()
}
return nil, false
}
// Delete deletes the value for a key.
func (m *Map[K, V]) Delete(key K) {
m.LoadAndDelete(key)
}
func (e *entry[V]) delete() (value *V, hadValue bool) {
for {
p := e.p.Load()
if p == nil || p == e.expunged() {
return nil, false
}
if e.p.CompareAndSwap(p, nil) {
return p, true
}
}
}
// Range calls f sequentially for each key and value present in the map.
// If f returns false, range stops the iteration.
//
// Range does not necessarily correspond to any consistent snapshot of the Map's
// contents: no key will be visited more than once, but if the value for any key
// is stored or deleted concurrently (including by f), Range may reflect any
// mapping for that key from any point during the Range call. Range does not
// block other methods on the receiver; even f itself may call any method on m.
//
// Range may be O(N) with the number of elements in the map even if f returns
// false after a constant number of calls.
func (m *Map[K, V]) Range(f func(key K, value *V) bool) {
// We need to be able to iterate over all of the keys that were already
// present at the start of the call to Range.
// If read.amended is false, then read.m satisfies that property without
// requiring us to hold m.mu for a long time.
read := m.loadReadOnly()
if read.amended {
// m.dirty contains keys not in read.m. Fortunately, Range is already O(N)
// (assuming the caller does not break out early), so a call to Range
// amortizes an entire copy of the map: we can promote the dirty copy
// immediately!
func() {
m.mu.Lock()
defer m.mu.Unlock()
read = m.loadReadOnly()
if read.amended {
// Don't let read escape directly, otherwise it will allocate even
// when read.amended is false. Instead, constrain the allocation to
// just this branch.
newRead := &readOnly[K, V]{m: m.dirty}
m.read.Store(newRead)
read = *newRead
m.dirty = nil
m.misses = 0
}
}()
}
for k, e := range read.m {
v, ok := e.load()
if !ok {
continue
}
if !f(k, v) {
break
}
}
}
func (m *Map[K, V]) missLocked() {
m.misses++
if m.misses < len(m.dirty) {
return
}
m.read.Store(&readOnly[K, V]{m: m.dirty})
m.dirty = nil
m.misses = 0
}
func (m *Map[K, V]) dirtyLocked() {
if m.dirty != nil {
return
}
read := m.loadReadOnly()
m.dirty = make(map[K]*entry[V], len(read.m))
for k, e := range read.m {
if !e.tryExpungeLocked() {
m.dirty[k] = e
}
}
}
// expunged is an untyped arbitrary pointer that marks entries which have been
// deleted from the dirty map.
//
// It is safe for the size of the expunged pointer to differ from the size of a
// pointer to a V because the expunged pointer is never dereferenced. However,
// we use a large allocation for the expunged pointer to suppress checkptr
// assertions under race. Bump this size if you see checkptr failures.
var expunged = unsafe.Pointer(new([1 << 14]byte /* 16KiB */))
// expunged returns the typed arbitrary value that marks entries which have been
// deleted from the dirty map.
func (e *entry[V]) expunged() *V {
return (*V)(expunged)
}
func (e *entry[V]) tryExpungeLocked() (isExpunged bool) {
p := e.p.Load()
for p == nil {
if e.p.CompareAndSwap(nil, e.expunged()) {
return true
}
p = e.p.Load()
}
return p == e.expunged()
}
// Copyright 2016 The Cockroach Authors.
//
// Use of this software is governed by the CockroachDB Software License
// included in the /LICENSE file.
//go:build !deadlock && !race
package syncutil
import "sync"
// DeadlockEnabled is true if the deadlock detector is enabled.
const DeadlockEnabled = false
// A Mutex is a mutual exclusion lock.
type Mutex struct {
sync.Mutex
}
// AssertHeld may panic if the mutex is not locked (but it is not required to
// do so). Functions which require that their callers hold a particular lock
// may use this to enforce this requirement more directly than relying on the
// race detector.
//
// Note that we do not require the lock to be held by any particular thread,
// just that some thread holds the lock. This is both more efficient and allows
// for rare cases where a mutex is locked in one thread and used in another.
func (m *Mutex) AssertHeld() {
}
// An RWMutex is a reader/writer mutual exclusion lock.
type RWMutex struct {
sync.RWMutex
}
// AssertHeld may panic if the mutex is not locked for writing (but it is not
// required to do so). Functions which require that their callers hold a
// particular lock may use this to enforce this requirement more directly than
// relying on the race detector.
//
// Note that we do not require the exclusive lock to be held by any particular
// thread, just that some thread holds the lock. This is both more efficient
// and allows for rare cases where a mutex is locked in one thread and used in
// another.
func (rw *RWMutex) AssertHeld() {
}
// AssertRHeld may panic if the mutex is not locked for reading (but it is not
// required to do so). If the mutex is locked for writing, it is also considered
// to be locked for reading. Functions which require that their callers hold a
// particular lock may use this to enforce this requirement more directly than
// relying on the race detector.
//
// Note that we do not require the shared lock to be held by any particular
// thread, just that some thread holds the lock. This is both more efficient
// and allows for rare cases where a mutex is locked in one thread and used in
// another.
func (rw *RWMutex) AssertRHeld() {
}
// Copyright 2023 The Cockroach Authors.
//
// Use of this software is governed by the CockroachDB Software License
// included in the /LICENSE file.
package syncutil
import (
"context"
"sync"
"time"
)
// TracedLock is like Lock, but logs a trace event using the provided context if
// the lock acquisition is slow.
func (m *Mutex) TracedLock(ctx context.Context) { tracedLock(ctx, m) }
// TracedLock is like Lock, but logs a trace event using the provided context if
// the lock acquisition is slow.
func (rw *RWMutex) TracedLock(ctx context.Context) { tracedLock(ctx, rw) }
// TracedRLock is like RLock, but logs a trace event using the provided context
// if the lock acquisition is slow.
func (rw *RWMutex) TracedRLock(ctx context.Context) { tracedLock(ctx, rw.rTryLocker()) }
// TimedLock is like Lock, but returns the time it took to acquire the lock.
func (m *Mutex) TimedLock() time.Duration { return timedLock(m) }
// TimedLock is like Lock, but returns the time it took to acquire the lock.
func (rw *RWMutex) TimedLock() time.Duration { return timedLock(rw) }
// TimedRLock is like RLock, but returns the time it took to acquire the lock.
func (rw *RWMutex) TimedRLock() time.Duration { return timedLock(rw.rTryLocker()) }
// rTryLocker returns a tryLocker interface that implements the Lock, Unlock,
// and TryLock methods by calling rw.RLock, rw.RUnlock, and rw.TryRLock,
// respectively.
func (rw *RWMutex) rTryLocker() tryLocker { return (*rTryLocker)(rw) }
type rTryLocker RWMutex
func (r *rTryLocker) Lock() { (*RWMutex)(r).RLock() }
func (r *rTryLocker) Unlock() { (*RWMutex)(r).RUnlock() }
func (r *rTryLocker) TryLock() bool { return (*RWMutex)(r).TryRLock() }
// tryLocker extends the sync.Locker interface with a TryLock method.
type tryLocker interface {
sync.Locker
TryLock() bool
}
// tracedLock is like l.Lock, but logs a trace event using the provided context
// if the lock acquisition is slow.
//
// Explanation of logic:
//
// The function begins with a fast-path call to TryLock. Most mutexes are
// uncontended and TryLock amounts to a single atomic CAS. If the CAS succeeds,
// no additional work is needed. If the CAS fails, we move on to the slow-path.
//
// On the slow path, we first check if expensive logging is enabled. If not, we
// simply call Lock without checking the time. We only time the acquisition if
// expensive logging is enabled. If we do time the acquisition and it is slow,
// we log a warning message to the logs/trace.
//
// It could be a reasonable choice to switch the order of the TryLock and
// ExpensiveLogEnabled checks. However, we expect that most mutex acquisitions
// will be uncontended and the TryLock check is cheaper than the expensive log
// check.
//
// NOTE: because of this ordering of fast-path checks, it does not make sense to
// implement tracedLock using timedLock, though it is conceptually an extension
// of that functionality.
func tracedLock(ctx context.Context, l tryLocker) {
if enableTracedLockFastPath && l.TryLock() {
return // fast-path
}
const vLevel = 3
if !LogExpensiveLogEnabled(ctx, vLevel) {
l.Lock()
return
}
start := time.Now()
l.Lock()
if dur := time.Since(start); dur >= slowLockLogThreshold {
LogVEventfDepth(ctx, 2 /* depth */, vLevel, "slow mutex acquisition took %s", dur)
}
}
// timedLock is like l.Lock, but returns the time it took to acquire the lock.
// Returns 0 if the lock was acquired without blocking.
func timedLock(l tryLocker) time.Duration {
if enableTracedLockFastPath && l.TryLock() {
return 0 // fast-path
}
start := time.Now()
l.Lock()
return time.Since(start)
}
// enableTracedLockFastPath is used in tests to disable the fast-path of
// tracedLock and timedLock.
var enableTracedLockFastPath = true
// slowLockLogThreshold is the threshold at which a mutex acquisition is
// considered slow enough to log. It is a variable and not constant so that it
// can be changed in tests.
var slowLockLogThreshold = 500 * time.Microsecond
// LogExpensiveLogEnabled is injected from pkg/util/log to avoid an import
// cycle. This also allows it to be mocked out in tests.
//
// See log.ExpensiveLogEnabled for more details.
var LogExpensiveLogEnabled = func(ctx context.Context, level int32) bool { return false }
// LogVEventfDepth is injected from pkg/util/log to avoid an import
// cycle. This also allows it to be mocked out in tests.
//
// See log.LogVEventfDepth for more details.
var LogVEventfDepth = func(ctx context.Context, depth int, level int32, format string, args ...interface{}) {}
// Copyright 2024 The Cockroach Authors.
//
// Use of this software is governed by the CockroachDB Software License
// included in the /LICENSE file.
package syncutil
// Set is like a Go map[V]struct{} but is safe for concurrent use by multiple
// goroutines without additional locking or coordination.
// Loads, stores, and deletes run in amortized constant time.
//
// See the Map type for more details.
type Set[V comparable] struct {
m Map[V, struct{}]
}
// Contains returns whether the value is stored in the set.
//
// Both Add and Remove return whether the value previously existed in the set,
// so Contains is typically not needed when later calling one of those methods.
// In fact, its use in logic that later mutates the set may be racy if not
// carefully considered, as there is no synchronization between the Contains
// call and the subsequent Add or Remove call.
func (s *Set[V]) Contains(value V) bool {
_, ok := s.m.Load(value)
return ok
}
// dummyValue is a placeholder value for all values in the map.
var dummyValue = new(struct{})
// Add adds the value to the set.
//
// Returns whether the value was added (true) or was already present (false).
func (s *Set[V]) Add(value V) bool {
_, loaded := s.m.LoadOrStore(value, dummyValue)
return !loaded
}
// Remove removes the value from the set.
//
// Returns whether the value was present and removed (true) or was not present
// and not removed (false).
func (s *Set[V]) Remove(value V) bool {
_, loaded := s.m.LoadAndDelete(value)
return loaded
}
// Range calls f sequentially for each value present in the set.
// If f returns false, range stops the iteration.
//
// See Map.Range for more details.
func (s *Set[V]) Range(f func(value V) bool) {
s.m.Range(func(value V, _ *struct{}) bool {
return f(value)
})
}
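// A minimal usage sketch of Set (illustrative only):
//
//	var s Set[string]
//	added := s.Add("a")      // true: "a" was not present before
//	ok := s.Contains("a")    // true
//	removed := s.Remove("a") // true: "a" was present and removed
//	_, _, _ = added, ok, removed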
// Copyright 2016 The Cockroach Authors.
//
// Use of this software is governed by the CockroachDB Software License
// included in the /LICENSE file.
//go:build linux
package util
import (
"fmt"
"github.com/cockroachdb/cockroach/pkg/util/randutil"
)
func init() {
r, _ := randutil.NewTestRand()
// 127.255.255.255 is special (broadcast), so choose values less
// than 255.
a := r.Intn(255)
b := r.Intn(255)
c := r.Intn(255)
IsolatedTestAddr = NewUnresolvedAddr("tcp", fmt.Sprintf("127.%d.%d.%d:0", a, b, c))
}
// Copyright 2022 The Cockroach Authors.
//
// Use of this software is governed by the CockroachDB Software License
// included in the /LICENSE file.
package timeutil
import (
"time"
"github.com/cockroachdb/cockroach/pkg/util/grunning"
"github.com/cockroachdb/cockroach/pkg/util/syncutil"
)
// CPUStopWatch is a wrapper around cpuStopWatch that is safe to use
// concurrently. If CPUStopWatch is nil, all operations are no-ops and no
// locks are acquired.
type CPUStopWatch struct {
mu struct {
syncutil.Mutex
cpuStopWatch cpuStopWatch
}
}
// NewCPUStopWatch returns a new CPUStopWatch if the grunning library is
// supported. Otherwise, it returns nil.
func NewCPUStopWatch() *CPUStopWatch {
if grunning.Supported() {
return &CPUStopWatch{}
}
return nil
}
// Start starts the CPU stop watch if it hasn't already been started.
func (w *CPUStopWatch) Start() {
if w == nil {
return
}
w.mu.Lock()
defer w.mu.Unlock()
w.mu.cpuStopWatch.start()
}
// Stop stops the CPU stop watch if it hasn't already been stopped and
// accumulates the CPU time that has been spent since it was started. If the
// CPU stop watch has already been stopped, it is a noop.
func (w *CPUStopWatch) Stop() {
if w == nil {
return
}
w.mu.Lock()
defer w.mu.Unlock()
w.mu.cpuStopWatch.stop()
}
// Elapsed returns the total CPU time measured by the stop watch so far.
func (w *CPUStopWatch) Elapsed() time.Duration {
if w == nil {
return 0
}
w.mu.Lock()
defer w.mu.Unlock()
return w.mu.cpuStopWatch.elapsed()
}
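// A minimal usage sketch (illustrative; NewCPUStopWatch returns nil when
// grunning is unsupported, and a nil *CPUStopWatch is safe to use):
//
//	w := NewCPUStopWatch()
//	w.Start()
//	// ... CPU-bound work ...
//	w.Stop()
//	cpu := w.Elapsed() // 0 if w is nil
//	_ = cpu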
// cpuStopWatch is a utility stop watch for measuring CPU time spent by a
// component. It can be safely started and stopped multiple times, but is
// not safe to use concurrently. If cpuStopWatch is nil, all operations are
// no-ops.
//
// Note that the grunning library uses a non-monotonic clock, so the measured
// duration between clock start and stop can come out as negative. This can lead
// to discrepancies in the measured CPU time - for example, a child stopwatch
// that is started and stopped while a parent stopwatch is running can rarely
// measure a larger CPU time duration than the parent. Users must be prepared to
// handle this case.
type cpuStopWatch struct {
startCPU time.Duration
totalCPU time.Duration
}
func (w *cpuStopWatch) start() {
if w == nil {
return
}
w.startCPU = grunning.Time()
}
func (w *cpuStopWatch) stop() {
if w == nil {
return
}
w.totalCPU += grunning.Elapsed(w.startCPU, grunning.Time())
}
func (w *cpuStopWatch) elapsed() time.Duration {
if w == nil {
return 0
}
return w.totalCPU
}
// Copyright 2020 The Cockroach Authors.
//
// Use of this software is governed by the CockroachDB Software License
// included in the /LICENSE file.
package timeutil
import (
"container/heap"
"container/list"
"fmt"
"sort"
"time"
"github.com/cockroachdb/cockroach/pkg/util/syncutil"
)
// ManualTime is a testing implementation of TimeSource.
type ManualTime struct {
mu struct {
syncutil.Mutex
now time.Time
timers manualTimerQueue
// tickers is a list with element type *manualTicker.
tickers list.List
}
}
// NewManualTime constructs a new ManualTime.
func NewManualTime(initialTime time.Time) *ManualTime {
mt := ManualTime{}
mt.mu.now = initialTime
mt.mu.timers = manualTimerQueue{
m: make(map[*manualTimer]int),
}
mt.mu.tickers.Init()
return &mt
}

var _ TimeSource = (*ManualTime)(nil)
// Now returns the current time.
func (m *ManualTime) Now() time.Time {
m.mu.Lock()
defer m.mu.Unlock()
return m.mu.now
}
// Since implements the TimeSource interface.
func (m *ManualTime) Since(t time.Time) time.Duration {
return m.Now().Sub(t)
}
// NewTimer constructs a new timer.
func (m *ManualTime) NewTimer() TimerI {
return &manualTimer{m: m}
}
// NewTicker creates a new ticker.
func (m *ManualTime) NewTicker(duration time.Duration) TickerI {
if duration <= 0 {
panic("non-positive interval for NewTicker")
}
m.mu.Lock()
defer m.mu.Unlock()
t := &manualTicker{
m: m,
duration: duration,
nextTick: m.mu.now.Add(duration),
// We allocate a big buffer so that sending a tick never blocks.
ch: make(chan time.Time, 10000),
}
t.element = m.mu.tickers.PushBack(t)
return t
}
// Advance forwards the current time by the given duration.
func (m *ManualTime) Advance(duration time.Duration) {
m.AdvanceTo(m.Now().Add(duration))
}
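// A minimal usage sketch of ManualTime with a timer (illustrative only):
//
//	mt := NewManualTime(time.Unix(0, 0))
//	timer := mt.NewTimer()
//	timer.Reset(10 * time.Second)
//	mt.Advance(10 * time.Second) // fires the timer
//	<-timer.Ch()
//	timer.MarkRead()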
// Backwards moves the clock back by duration. Duration is expected to be
// positive, and it will be subtracted from the current time.
func (m *ManualTime) Backwards(duration time.Duration) {
if duration < 0 {
panic("invalid negative duration")
}
m.mu.Lock()
defer m.mu.Unlock()
// No timers fire when the clock goes backwards.
m.mu.now = m.mu.now.Add(-duration)
}
// AdvanceTo advances the current time to t. If t is earlier than the current
// time then AdvanceTo is a no-op.
func (m *ManualTime) AdvanceTo(now time.Time) {
m.mu.Lock()
defer m.mu.Unlock()
m.advanceToLocked(now)
}
// MustAdvanceTo is like AdvanceTo, except it panics if now is below m's current time.
func (m *ManualTime) MustAdvanceTo(now time.Time) {
m.mu.Lock()
defer m.mu.Unlock()
if now.Before(m.mu.now) {
panic(fmt.Sprintf("attempting to move ManualTime backwards from %s to %s", m.mu.now, now))
}
m.advanceToLocked(now)
}
func (m *ManualTime) advanceToLocked(now time.Time) {
if !now.After(m.mu.now) {
return
}
m.mu.now = now
// Fire off any timers.
for m.mu.timers.Len() > 0 {
next := m.mu.timers.heap[0]
if next.at.After(now) {
break
}
next.ch <- next.at
heap.Pop(&m.mu.timers)
}
// Fire off any tickers.
for e := m.mu.tickers.Front(); e != nil; e = e.Next() {
t := e.Value.(*manualTicker)
for !t.nextTick.After(now) {
select {
case t.ch <- t.nextTick:
default:
panic("ticker channel full")
}
t.nextTick = t.nextTick.Add(t.duration)
}
}
}
func (m *ManualTime) add(mt *manualTimer) {
m.mu.Lock()
defer m.mu.Unlock()
if !mt.at.After(m.mu.now) {
mt.ch <- mt.at
} else {
heap.Push(&m.mu.timers, mt)
}
}
func (m *ManualTime) removeTimer(mt *manualTimer) bool {
m.mu.Lock()
defer m.mu.Unlock()
if idx, ok := m.mu.timers.m[mt]; ok {
heap.Remove(&m.mu.timers, idx)
return true
}
return false
}
func (m *ManualTime) removeTicker(t *manualTicker) {
m.mu.Lock()
defer m.mu.Unlock()
if t.element != nil {
m.mu.tickers.Remove(t.element)
t.element = nil
}
}
// Timers returns a snapshot of the timestamps of the pending timers.
func (m *ManualTime) Timers() []time.Time {
m.mu.Lock()
defer m.mu.Unlock()
timers := make([]time.Time, m.mu.timers.Len())
for i, t := range m.mu.timers.heap {
timers[i] = t.at
}
sort.Slice(timers, func(i, j int) bool {
return timers[i].Before(timers[j])
})
return timers
}
type manualTimerQueue struct {
// m maintains the index for a timer in heap.
m map[*manualTimer]int
heap []*manualTimer
}
var _ heap.Interface = (*manualTimerQueue)(nil)
func (m *manualTimerQueue) Len() int {
return len(m.heap)
}
func (m *manualTimerQueue) Less(i, j int) bool {
return m.heap[i].at.Before(m.heap[j].at)
}
func (m *manualTimerQueue) Swap(i, j int) {
m.heap[i], m.heap[j] = m.heap[j], m.heap[i]
m.m[m.heap[i]] = i
m.m[m.heap[j]] = j
}
func (m *manualTimerQueue) Push(x interface{}) {
mt := x.(*manualTimer)
m.m[mt] = len(m.heap)
m.heap = append(m.heap, mt)
}
func (m *manualTimerQueue) Pop() interface{} {
lastIdx := len(m.heap) - 1
ret := m.heap[lastIdx]
delete(m.m, ret)
m.heap = m.heap[:lastIdx]
return ret
}
type manualTimer struct {
m *ManualTime
at time.Time
ch chan time.Time
}
var _ TimerI = (*manualTimer)(nil)
func (m *manualTimer) Reset(duration time.Duration) {
m.Stop()
m.at = m.m.Now().Add(duration)
m.ch = make(chan time.Time, 1)
m.m.add(m)
}
func (m *manualTimer) Stop() bool {
removed := m.m.removeTimer(m)
m.ch = nil
m.at = time.Time{}
return removed
}
func (m *manualTimer) Ch() <-chan time.Time {
return m.ch
}
func (m *manualTimer) MarkRead() {}
type manualTicker struct {
m *ManualTime
element *list.Element
duration time.Duration
nextTick time.Time
ch chan time.Time
}
// Reset is part of the TickerI interface.
func (t *manualTicker) Reset(duration time.Duration) {
panic("not implemented")
}
// Stop is part of the TickerI interface.
func (t *manualTicker) Stop() {
t.m.removeTicker(t)
}
// Ch is part of the TickerI interface.
func (t *manualTicker) Ch() <-chan time.Time {
return t.ch
}
// Copyright 2019 The Cockroach Authors.
//
// Use of this software is governed by the CockroachDB Software License
// included in the /LICENSE file.
package timeutil
import (
"time"
"github.com/cockroachdb/cockroach/pkg/util/grunning"
"github.com/cockroachdb/cockroach/pkg/util/syncutil"
)
// StopWatch is a utility stop watch that can be safely started and stopped
// multiple times and can be used concurrently.
type StopWatch struct {
mu struct {
syncutil.Mutex
// started is true if the stop watch has been started and hasn't been
// stopped since then.
started bool
// startedAt is the time when the stop watch was started.
startedAt time.Time
// elapsed is the total time measured by the stop watch (i.e. between
// all Starts and Stops).
elapsed time.Duration
// timeSource is the source of time used by the stop watch. It is always
// timeutil.Now except for tests.
timeSource func() time.Time
// cpuStopWatch is used to track CPU usage. It may be nil, in which case any
// operations on it are no-ops.
cpuStopWatch *cpuStopWatch
}
}
// NewStopWatch creates a new StopWatch.
func NewStopWatch() *StopWatch {
return newStopWatch(Now)
}
// NewStopWatchWithCPU creates a new StopWatch that will track CPU usage in
// addition to wall-clock time.
func NewStopWatchWithCPU() *StopWatch {
w := newStopWatch(Now)
if grunning.Supported() {
w.mu.cpuStopWatch = &cpuStopWatch{}
}
return w
}
// NewTestStopWatch creates a new StopWatch with the given time source. It is
// used for testing only.
func NewTestStopWatch(timeSource func() time.Time) *StopWatch {
return newStopWatch(timeSource)
}
func newStopWatch(timeSource func() time.Time) *StopWatch {
w := &StopWatch{}
w.mu.timeSource = timeSource
return w
}
// Start starts the stop watch if it hasn't already been started.
func (w *StopWatch) Start() {
w.mu.Lock()
defer w.mu.Unlock()
if !w.mu.started {
w.mu.started = true
w.mu.startedAt = w.mu.timeSource()
w.mu.cpuStopWatch.start()
}
}
// Stop stops the stop watch if it hasn't already been stopped and accumulates
// the duration that elapsed since it was started. If the stop watch has
// already been stopped, it is a noop.
func (w *StopWatch) Stop() {
w.mu.Lock()
defer w.mu.Unlock()
if w.mu.started {
w.mu.started = false
w.mu.elapsed += w.mu.timeSource().Sub(w.mu.startedAt)
w.mu.cpuStopWatch.stop()
}
}
// Elapsed returns the total time measured by the stop watch so far.
func (w *StopWatch) Elapsed() time.Duration {
w.mu.Lock()
defer w.mu.Unlock()
return w.mu.elapsed
}
// ElapsedCPU returns the total CPU time measured by the stop watch so far. It
// returns zero if cpuStopWatch is nil (which is the case if NewStopWatchWithCPU
// was not called or the platform does not support grunning).
func (w *StopWatch) ElapsedCPU() time.Duration {
w.mu.Lock()
defer w.mu.Unlock()
return w.mu.cpuStopWatch.elapsed()
}
// LastStartedAt returns the time the stopwatch was last started, and a bool
// indicating if the stopwatch is currently started.
func (w *StopWatch) LastStartedAt() (startedAt time.Time, started bool) {
w.mu.Lock()
defer w.mu.Unlock()
return w.mu.startedAt, w.mu.started
}
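// A minimal usage sketch of StopWatch (illustrative only):
//
//	w := NewStopWatch()
//	w.Start()
//	// ... do work ...
//	w.Stop()
//	elapsed := w.Elapsed()
//	_ = elapsed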
// TestTimeSource is a source of time that remembers when it was created (in
// terms of the real time) and returns the time based on its creation time and
// the number of "advances" it has had. It is used for testing only.
type TestTimeSource struct {
initTime time.Time
counter int64
}
// NewTestTimeSource creates a new TestTimeSource.
func NewTestTimeSource() *TestTimeSource {
return &TestTimeSource{initTime: Now()}
}
// Now tells the current time according to t.
func (t *TestTimeSource) Now() time.Time {
return t.initTime.Add(time.Duration(t.counter))
}
// Advance advances the current time according to t by 1 nanosecond.
func (t *TestTimeSource) Advance() {
t.counter++
}
// Elapsed returns how much time has passed since t was created. Note that
// it is equal to the number of advances in nanoseconds.
func (t *TestTimeSource) Elapsed() time.Duration {
return time.Duration(t.counter)
}
// Copyright 2016 The Cockroach Authors.
//
// Use of this software is governed by the CockroachDB Software License
// included in the /LICENSE file.
package timeutil
import (
"strings"
"time"
"unsafe"
)
// LibPQTimePrefix is the prefix lib/pq prints time-type datatypes with.
const LibPQTimePrefix = "0000-01-01"
// Now returns the current UTC time.
//
// We've decided in times immemorial that always returning UTC is a good policy
// across the cluster so that all the timestamps print uniformly across
// different nodes, and also because we were afraid that timestamps leak into
// SQL Datums, and there the timestamp matters. Years later, it's not clear
// whether this was a good decision since it's forcing the nasty implementation
// below.
func Now() time.Time {
t := time.Now()
// HACK: instead of doing t = t.UTC(), we reach inside the
// struct and set the location manually. UTC() strips the monotonic clock reading
// from t, for no good reason: https://groups.google.com/g/golang-nuts/c/dyPTdi6oem8
// Stripping the monotonic part has bad consequences:
// 1. We lose the benefits of the monotonic clock reading.
// 2. On OSX, only the monotonic clock seems to have nanosecond resolution. If
// we strip it, we only get microsecond resolution. Besides generally sucking,
// microsecond resolution is not enough to guarantee that consecutive
// timeutil.Now() calls don't return the same instant. This trips up some of
// our tests, which assume that they can measure any duration of time.
// 3. time.Since(t) does one less system call when t has a monotonic reading,
// making it twice as fast as otherwise:
// https://cs.opensource.google/go/go/+/refs/tags/go1.17.2:src/time/time.go;l=878;drc=refs%2Ftags%2Fgo1.17.2
x := (*timeLayout)(unsafe.Pointer(&t))
x.loc = nil // nil means UTC
return t
}
// NowNoMono is like Now(), but it strips down the monotonic part of the
// timestamp. This is useful for getting timestamps that round-trip through
// various channels that strip out the monotonic part - for example yaml
// marshaling.
func NowNoMono() time.Time {
// UTC() has the side-effect of stripping the mono part.
return time.Now().UTC()
}
// StripMono returns a copy of t with its monotonic clock reading stripped. This
// is useful for getting a time.Time that compares == with another one that
// might not have the mono part. time.Time is meant to be compared with
// Time.Equal() (which ignores the mono), not with ==, but sometimes we have a
// time.Time in a bigger struct and we want to use require.Equal() or such.
func StripMono(t time.Time) time.Time {
// UTC() has the side-effect of stripping the mono part.
return t.UTC()
}
// timeLayout mimics time.Time, exposing all the fields. We do an unsafe cast of
// a time.Time to this in order to set the location.
type timeLayout struct {
wall uint64
ext int64
loc *time.Location
}
// Since returns the time elapsed since t.
// It is shorthand for Now().Sub(t), but more efficient.
func Since(t time.Time) time.Duration {
return time.Since(t)
}
// Until returns the duration until t.
// It is shorthand for t.Sub(Now()), but more efficient.
func Until(t time.Time) time.Duration {
return time.Until(t)
}
// UnixEpoch represents the Unix epoch, January 1, 1970 UTC.
var UnixEpoch = time.Unix(0, 0).UTC()
// FromUnixMicros returns the UTC time.Time corresponding to the given Unix
// time, usec microseconds since UnixEpoch. In Go's current time.Time
// implementation, all possible values for us can be represented as a time.Time.
func FromUnixMicros(us int64) time.Time {
return time.Unix(us/1e6, (us%1e6)*1e3).UTC()
}
// FromUnixNanos returns the UTC time.Time corresponding to the given Unix
// time, ns nanoseconds since UnixEpoch. In Go's current time.Time
// implementation, all possible values for ns can be represented as a time.Time.
func FromUnixNanos(ns int64) time.Time {
return time.Unix(ns/1e9, ns%1e9).UTC()
}
// ToUnixMicros returns t as the number of microseconds elapsed since UnixEpoch.
// Fractional microseconds are rounded, half up, using time.Round. Similar to
// time.Time.UnixNano, the result is undefined if the Unix time in microseconds
// cannot be represented by an int64.
func ToUnixMicros(t time.Time) int64 {
return t.Unix()*1e6 + int64(t.Round(time.Microsecond).Nanosecond())/1e3
}
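// For example (illustrative; values chosen arbitrarily):
//
//	t := FromUnixMicros(1_000_001) // 1s and 1µs after UnixEpoch, in UTC
//	us := ToUnixMicros(t)          // 1000001
//	_ = us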
// Unix wraps time.Unix ensuring that the result is in UTC instead of Local.
//
// The process of deriving the args to construct a specific time.Time:
//
// // say we want to construct timestamp "294277-01-01 23:59:59.999999 +0000 UTC"
// tm := time.Date(294277, 1, 1, 23, 59, 59, 999999000, time.UTC)
// // get the args of "timeutil.Unix"
// sec := tm.Unix()
// nsec := int64(tm.Nanosecond())
// // verify
// fmt.Println(tm == time.Unix(sec, nsec).UTC())
func Unix(sec, nsec int64) time.Time {
return time.Unix(sec, nsec).UTC()
}
// ReplaceLibPQTimePrefix replaces unparsable lib/pq dates used for timestamps
// (0000-01-01) with timestamps that can be parsed by date libraries.
func ReplaceLibPQTimePrefix(s string) string {
if strings.HasPrefix(s, LibPQTimePrefix) {
return "1970-01-01" + s[len(LibPQTimePrefix):]
}
return s
}
// Copyright 2020 The Cockroach Authors.
//
// Use of this software is governed by the CockroachDB Software License
// included in the /LICENSE file.
package timeutil
import "time"
// TimeSource is used to interact with clocks and timers. Generally exposed for
// testing.
type TimeSource interface {
Now() time.Time
Since(t time.Time) time.Duration
NewTimer() TimerI
NewTicker(duration time.Duration) TickerI
}
// TimerI is an interface wrapping Timer.
type TimerI interface {
// Reset will set the timer to notify on Ch() after duration.
Reset(duration time.Duration)
// Stop prevents the Timer from firing.
Stop() bool
// Ch returns the channel which will be notified when the timer reaches its
// time.
Ch() <-chan time.Time
// MarkRead should be called when a value is read from the Ch() channel.
// If MarkRead is not called, resetting the timer is less efficient.
MarkRead()
}
// TickerI is an interface wrapping Ticker.
type TickerI interface {
// Reset stops a ticker and resets its period to the specified duration. The
// next tick will arrive after the new period elapses.
Reset(duration time.Duration)
// Stop turns off a ticker. After Stop, no more ticks will be sent. Stop does
// not close the channel, to prevent a concurrent goroutine reading from the
// channel from seeing an erroneous "tick".
Stop()
// Ch returns the channel on which the ticks are delivered.
Ch() <-chan time.Time
}
// DefaultTimeSource is a TimeSource using the system clock.
type DefaultTimeSource struct{}
var _ TimeSource = DefaultTimeSource{}
// Now returns timeutil.Now().
func (DefaultTimeSource) Now() time.Time {
return Now()
}
// Since implements the TimeSource interface.
func (DefaultTimeSource) Since(t time.Time) time.Duration {
return Since(t)
}
// NewTimer returns a TimerI wrapping *Timer.
func (DefaultTimeSource) NewTimer() TimerI {
return (*timer)(new(Timer))
}
// NewTicker creates a new ticker.
func (DefaultTimeSource) NewTicker(duration time.Duration) TickerI {
return (*ticker)(time.NewTicker(duration))
}
type timer Timer
var _ TimerI = (*timer)(nil)
func (t *timer) Reset(duration time.Duration) {
(*Timer)(t).Reset(duration)
}
func (t *timer) Stop() bool {
return (*Timer)(t).Stop()
}
func (t *timer) Ch() <-chan time.Time {
return t.C
}
func (t *timer) MarkRead() {
t.Read = true
}
type ticker time.Ticker
var _ TickerI = (*ticker)(nil)
// Reset is part of the TickerI interface.
func (t *ticker) Reset(duration time.Duration) {
(*time.Ticker)(t).Reset(duration)
}
// Stop is part of the TickerI interface.
func (t *ticker) Stop() {
(*time.Ticker)(t).Stop()
}
// Ch is part of the TickerI interface.
func (t *ticker) Ch() <-chan time.Time {
return (*time.Ticker)(t).C
}
// Copyright 2017 The Cockroach Authors.
//
// Use of this software is governed by the CockroachDB Software License
// included in the /LICENSE file.
package timeutil
import (
"fmt"
"regexp"
"strconv"
"strings"
"time"
"github.com/cockroachdb/errors"
)
const (
offsetBoundSecs = 167*60*60 + 59*60
// PG supports UTC hour offsets in the range [-167, 167].
maxUTCHourOffset = 167
maxUTCHourOffsetInSeconds = maxUTCHourOffset * 60 * 60
)
var timezoneOffsetRegex = regexp.MustCompile(`(?i)^(GMT|UTC)?([+-])?(\d{1,3}(:[0-5]?\d){0,2})$`)
// FixedTimeZoneOffsetToLocation creates a time.Location with an offset and a
// time zone string.
func FixedTimeZoneOffsetToLocation(offset int, origRepr string) *time.Location {
// The offset name always should be normalized to upper-case for UTC/GMT.
return time.FixedZone(strings.ToUpper(origRepr), offset)
}
// TimeZoneOffsetToLocation takes an offset and name that can be marshaled by
// crdb between nodes and creates a time.Location.
// Note that the display time zone is always shown with ISO sign convention.
func TimeZoneOffsetToLocation(offset int) *time.Location {
origRepr := secondsToHoursMinutesSeconds(offset)
if offset <= 0 {
origRepr = fmt.Sprintf("<-%s>+%s", origRepr, origRepr)
} else {
origRepr = fmt.Sprintf("<+%s>-%s", origRepr, origRepr)
}
return time.FixedZone(origRepr, offset)
}
// TimeZoneStringToLocationStandard is an option for the standard to use
// for parsing in TimeZoneStringToLocation.
type TimeZoneStringToLocationStandard uint32
const (
// TimeZoneStringToLocationISO8601Standard parses int UTC offsets as *east* of
// the GMT line, e.g. `-5` would be 'America/New_York' without daylight savings.
TimeZoneStringToLocationISO8601Standard TimeZoneStringToLocationStandard = iota
// TimeZoneStringToLocationPOSIXStandard parses int UTC offsets as *west* of the
// GMT line, e.g. `+5` would be 'America/New_York' without daylight savings.
TimeZoneStringToLocationPOSIXStandard
)
// TimeZoneStringToLocation transforms a string into a time.Location. It
// supports the usual locations and also time zones with fixed offsets created
// by FixedTimeZoneOffsetToLocation().
func TimeZoneStringToLocation(
locStr string, std TimeZoneStringToLocationStandard,
) (*time.Location, error) {
// ParseTimeZoneOffset uses strconv.ParseFloat, which returns an error when
// parsing fails that is expensive to construct. We first check if the string
// contains any non-numeric characters to see if we can skip attempting to
// parse it as a timezone offset. `/` is also checked since that character
// appears in most timezone names. Since UTC is the most commonly used
// timezone, we also check for that explicitly to avoid calling ContainsAny if
// possible.
containsNonNumeric := strings.EqualFold(locStr, "utc") || strings.ContainsAny(locStr, "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ/")
if !containsNonNumeric {
offset, _, parsed := ParseTimeZoneOffset(locStr, std)
if parsed {
if offset < -maxUTCHourOffsetInSeconds || offset > maxUTCHourOffsetInSeconds {
return nil, errors.New("UTC timezone offset is out of range.")
}
return TimeZoneOffsetToLocation(offset), nil
}
}
// The time may just be a raw int value. Similar to the above, in order to
// avoid constructing an expensive error, we first check if the string
// contains any non-numeric characters to see if we can skip attempting to
// parse it as an int.
if !containsNonNumeric {
intVal, err := strconv.ParseInt(locStr, 10, 64)
if err == nil {
// Parsing an int has different behavior for POSIX and ISO8601.
if std == TimeZoneStringToLocationPOSIXStandard {
intVal *= -1
}
if intVal < -maxUTCHourOffset || intVal > maxUTCHourOffset {
return nil, errors.New("UTC timezone offset is out of range.")
}
return TimeZoneOffsetToLocation(int(intVal) * 60 * 60), nil
}
}
locTransforms := []func(string) string{
func(s string) string { return s },
strings.ToUpper,
strings.ToTitle,
}
for _, transform := range locTransforms {
if loc, err := LoadLocation(transform(locStr)); err == nil {
return loc, nil
}
}
tzOffset, ok := timeZoneOffsetStringConversion(locStr, std)
if !ok {
return nil, errors.Newf("could not parse %q as time zone", locStr)
}
return FixedTimeZoneOffsetToLocation(int(tzOffset), locStr), nil
}
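// For example (illustrative; fixed-offset zone names follow
// TimeZoneOffsetToLocation):
//
//	loc, err := TimeZoneStringToLocation("America/New_York", TimeZoneStringToLocationISO8601Standard)
//	loc, err = TimeZoneStringToLocation("UTC", TimeZoneStringToLocationISO8601Standard)
//	loc, err = TimeZoneStringToLocation("-5", TimeZoneStringToLocationISO8601Standard) // fixed offset, 5 hours west of UTC
//	_, _ = loc, err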
// ParseTimeZoneOffset takes the string representation of a time.Location
// created by TimeZoneOffsetToLocation and parses it to the offset and the
// original representation specified by the user. The bool returned is true if
// parsing was successful.
// The offset is formatted <-%s>+%s or <+%s>-%s.
// A string with optional whitespace padding, an optional sign (+/-), and a
// float can also be parsed. Example: " +10.5" is parsed in PG and displayed
// as <+10:06:36>-10:06:36.
func ParseTimeZoneOffset(
location string, standard TimeZoneStringToLocationStandard,
) (offset int, origRepr string, success bool) {
if strings.HasPrefix(location, "<") {
// The string has the format <+HH:MM:SS>-HH:MM:SS or <-HH:MM:SS>+HH:MM:SS.
// Parse the time between the < >.
// Grab the time from between the < >.
regexPattern, err := regexp.Compile(`\<[+-].*\>`)
if err != nil {
return 0, "", false
}
origRepr = regexPattern.FindString(location)
origRepr = strings.TrimPrefix(origRepr, "<")
origRepr = strings.TrimSuffix(origRepr, ">")
offsetMultiplier := 1
if strings.HasPrefix(origRepr, "-") {
offsetMultiplier = -1
}
origRepr = strings.Trim(origRepr, "+")
origRepr = strings.Trim(origRepr, "-")
// Parse HH:MM:SS time.
offset = hoursMinutesSecondsToSeconds(origRepr)
offset *= offsetMultiplier
return offset, location, true
}
// Try parsing the string in the format whitespaces optionally followed by
// (+/-) followed immediately by a float.
origRepr = strings.TrimSpace(location)
origRepr = strings.TrimPrefix(origRepr, "+")
multiplier := 1
if strings.HasPrefix(origRepr, "-") {
multiplier = -1
origRepr = strings.TrimPrefix(origRepr, "-")
}
if standard == TimeZoneStringToLocationPOSIXStandard {
multiplier *= -1
}
f, err := strconv.ParseFloat(origRepr, 64)
if err != nil {
return 0, "", false
}
origRepr = floatToHoursMinutesSeconds(f)
offset = hoursMinutesSecondsToSeconds(origRepr)
return multiplier * offset, origRepr, true
}
// timeZoneOffsetStringConversion converts a time string to offset seconds.
// Supported time zone strings: GMT/UTC±[00:00:00 - 167:59:00].
// Seconds and minutes may be omitted, and matching is case-insensitive.
// By default, anything with a UTC/GMT prefix, or with ':' characters, is
// treated as POSIX. Whole integers can be POSIX or ISO8601 standard depending
// on the std variable.
func timeZoneOffsetStringConversion(
s string, std TimeZoneStringToLocationStandard,
) (offset int64, ok bool) {
submatch := timezoneOffsetRegex.FindStringSubmatch(strings.ReplaceAll(s, " ", ""))
if len(submatch) == 0 {
return 0, false
}
hasUTCPrefix := submatch[1] != ""
prefix := submatch[2]
timeString := submatch[3]
offsets := strings.Split(timeString, ":")
offset = int64(hoursMinutesSecondsToSeconds(timeString))
// GMT/UTC prefix, colons and POSIX standard characters have "opposite" timezones.
if hasUTCPrefix || len(offsets) > 1 || std == TimeZoneStringToLocationPOSIXStandard {
offset *= -1
}
if prefix == "-" {
offset *= -1
}
if offset > offsetBoundSecs || offset < -offsetBoundSecs {
return 0, false
}
return offset, true
}
// The timestamp must be in one of the following formats:
//
// HH
// HH:MM
// HH:MM:SS
func hoursMinutesSecondsToSeconds(timeString string) int {
var (
hoursString = "0"
minutesString = "0"
secondsString = "0"
)
offsets := strings.Split(timeString, ":")
if strings.Contains(timeString, ":") {
hoursString, minutesString = offsets[0], offsets[1]
if len(offsets) == 3 {
secondsString = offsets[2]
}
} else {
hoursString = timeString
}
hours, _ := strconv.ParseInt(hoursString, 10, 64)
minutes, _ := strconv.ParseInt(minutesString, 10, 64)
seconds, _ := strconv.ParseInt(secondsString, 10, 64)
return int((hours * 60 * 60) + (minutes * 60) + seconds)
}
// secondsToHoursMinutesSeconds converts seconds to a timestamp of the format
//
// HH
// HH:MM
// HH:MM:SS
func secondsToHoursMinutesSeconds(totalSeconds int) string {
secondsPerHour := 60 * 60
secondsPerMinute := 60
if totalSeconds < 0 {
totalSeconds = totalSeconds * -1
}
hours := totalSeconds / secondsPerHour
minutes := (totalSeconds - hours*secondsPerHour) / secondsPerMinute
seconds := totalSeconds - hours*secondsPerHour - minutes*secondsPerMinute
if seconds == 0 && minutes == 0 {
return fmt.Sprintf("%02d", hours)
} else if seconds == 0 {
return fmt.Sprintf("%d:%d", hours, minutes)
} else {
// PG doesn't round; truncate precision.
return fmt.Sprintf("%d:%d:%2.0d", hours, minutes, seconds)
}
}
// floatToHoursMinutesSeconds converts a float to a HH:MM:SS.
// The minutes and seconds sections are only included if the precision is
// necessary.
// For example:
//
// 11.00 -> 11
// 11.5 -> 11:30
// 11.51 -> 11:30:36
func floatToHoursMinutesSeconds(f float64) string {
hours := int(f)
remaining := f - float64(hours)
secondsPerHour := float64(60 * 60)
totalSeconds := remaining * secondsPerHour
minutes := int(totalSeconds / 60)
seconds := totalSeconds - float64(minutes*60)
if seconds == 0 && minutes == 0 {
return fmt.Sprintf("%02d", hours)
} else if seconds == 0 {
return fmt.Sprintf("%d:%d", hours, minutes)
} else {
// PG doesn't round; truncate precision.
return fmt.Sprintf("%d:%d:%2.0f", hours, minutes, seconds)
}
}
// Copyright 2021 The Cockroach Authors.
//
// Use of this software is governed by the CockroachDB Software License
// included in the /LICENSE file.
package timeutil
import (
"context"
"time"
"github.com/cockroachdb/errors"
"github.com/cockroachdb/redact"
)
// RunWithTimeout runs a function with a timeout, the same way you'd do with
// context.WithTimeout. It improves the opaque error messages returned by
// WithTimeout by augmenting them with the op string that is passed in.
func RunWithTimeout(
ctx context.Context,
op redact.RedactableString,
timeout time.Duration,
fn func(ctx context.Context) error,
) error {
ctx, cancel := context.WithTimeout(ctx, timeout) // nolint:context
defer cancel()
start := Now()
err := fn(ctx)
if err != nil && errors.Is(ctx.Err(), context.DeadlineExceeded) {
err = &TimeoutError{
operation: op,
timeout: timeout,
took: Since(start),
cause: err,
}
}
return err
}
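// A minimal usage sketch (illustrative; doWork is a hypothetical callee):
//
//	err := RunWithTimeout(ctx, "do work", 5*time.Second,
//		func(ctx context.Context) error {
//			return doWork(ctx)
//		})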
// Copyright 2021 The Cockroach Authors.
//
// Use of this software is governed by the CockroachDB Software License
// included in the /LICENSE file.
package timeutil
import (
"context"
"fmt"
"net"
"time"
"github.com/cockroachdb/errors"
"github.com/cockroachdb/errors/errorspb"
"github.com/cockroachdb/redact"
"github.com/gogo/protobuf/proto"
)
// TimeoutError is a wrapped ContextDeadlineExceeded error. It indicates that
// an operation didn't complete within its designated timeout.
type TimeoutError struct {
// The operation that timed out.
operation redact.RedactableString
// The configured timeout.
timeout time.Duration
// The duration of the operation. This is usually expected to be the same as
// the timeout, but can be longer if the timeout was not observed expediently
// (because the ctx was not checked sufficiently often).
took time.Duration
cause error
}
var _ error = (*TimeoutError)(nil)
var _ fmt.Formatter = (*TimeoutError)(nil)
var _ errors.SafeFormatter = (*TimeoutError)(nil)
// We implement net.Error the same way that context.DeadlineExceeded does, so
// that people looking for net.Error attributes will still find them.
var _ net.Error = (*TimeoutError)(nil)
// Operation returns the name of the operation that timed out.
func (t *TimeoutError) Operation() redact.RedactableString {
return t.operation
}
func (t *TimeoutError) Error() string { return fmt.Sprintf("%v", t) }
// Format implements fmt.Formatter.
func (t *TimeoutError) Format(s fmt.State, verb rune) { errors.FormatError(t, s, verb) }
// SafeFormatError implements errors.SafeFormatter.
func (t *TimeoutError) SafeFormatError(p errors.Printer) (next error) {
// NB: With RunWithTimeout(), it is possible for both the caller and the
// callee to have set their own context timeout that is smaller than the
// timeout set by RunWithTimeout. It is also possible for the operation to run
// for much longer than the timeout, e.g. if the callee does not check the
// context in a timely manner. The error message must make this clear.
p.Printf("operation \"%s\" timed out", t.operation)
if t.took != 0 {
p.Printf(" after %s", t.took.Round(time.Millisecond))
}
p.Printf(" (given timeout %s)", t.timeout)
return t.cause
}
// Timeout implements net.Error.
func (*TimeoutError) Timeout() bool { return true }
// Temporary implements net.Error.
func (*TimeoutError) Temporary() bool { return true }
// Cause implements Causer.
func (t *TimeoutError) Cause() error {
return t.cause
}
// encodeTimeoutError serializes a TimeoutError.
func encodeTimeoutError(
_ context.Context, err error,
) (msgPrefix string, safe []string, details proto.Message) {
t := err.(*TimeoutError)
details = &errorspb.StringsPayload{
Details: []string{t.timeout.String(), t.took.String()},
}
msgPrefix = fmt.Sprintf("operation %q timed out after %s", t.operation, t.timeout)
return msgPrefix, []string{string(t.operation)}, details
}
func decodeTimeoutError(
ctx context.Context, cause error, msgPrefix string, safeDetails []string, payload proto.Message,
) error {
m, ok := payload.(*errorspb.StringsPayload)
if !ok || len(m.Details) < 1 || len(safeDetails) < 1 {
// If this ever happens, this means some version of the library
// (presumably future) changed the payload type, and we're
// receiving this here. In this case, give up and let
// DecodeError use the opaque type.
return nil
}
op := redact.RedactableString(safeDetails[0])
timeout, decodeErr := time.ParseDuration(m.Details[0])
if decodeErr != nil {
// Not encoded by our encode function. Bail out.
return nil //nolint:returnerrcheck
}
var took time.Duration
if len(m.Details) >= 2 {
took, decodeErr = time.ParseDuration(m.Details[1])
if decodeErr != nil {
// Not encoded by our encode function. Bail out.
return nil //nolint:returnerrcheck
}
}
return &TimeoutError{
operation: op,
timeout: timeout,
took: took,
cause: cause,
}
}
func init() {
errors.RegisterTypeMigration("github.com/cockroachdb/cockroach/pkg/util/contextutil",
"*contextutil.TimeoutError", &TimeoutError{})
pKey := errors.GetTypeKey(&TimeoutError{})
errors.RegisterWrapperEncoder(pKey, encodeTimeoutError)
errors.RegisterWrapperDecoder(pKey, decodeTimeoutError)
}
// Copyright 2016 The Cockroach Authors.
//
// Use of this software is governed by the CockroachDB Software License
// included in the /LICENSE file.
package timeutil
import (
"sync"
"time"
)
var timeTimerPool sync.Pool
// The Timer type represents a single event. When the Timer expires,
// the current time will be sent on Timer.C.
//
// This timer implementation is an abstraction around the standard
// library's time.Timer that provides a temporary workaround for the
// issue described in https://github.com/golang/go/issues/14038. As
// such, this timer should only be used when Reset is planned to
// be called continually in a loop. For this Reset pattern to work,
// Timer.Read must be set to true whenever a timestamp is read from
// the Timer.C channel. If Timer.Read is not set to true when the
// channel is read from, the next call to Timer.Reset will deadlock.
// This pattern looks something like:
//
// var timer timeutil.Timer
// defer timer.Stop()
// for {
// timer.Reset(wait)
// select {
// case <-timer.C:
// timer.Read = true
// ...
// }
// }
//
// Note that unlike the standard library's Timer type, this Timer will
// not begin counting down until Reset is called for the first time, as
// there is no constructor function. The zero value for Timer is ready
// to use.
//
// TODO(nvanbenschoten): follow https://github.com/golang/go/issues/37196
// and remove this abstraction once it's no longer needed. There's some
// recent progress in https://go-review.googlesource.com/c/go/+/568341.
type Timer struct {
timer *time.Timer
// C is a local "copy" of timer.C that can be used in a select case before
// the timer has been initialized (via Reset).
C <-chan time.Time
Read bool
}
// AsTimerI returns the Timer as a TimerI. This is helpful
// to write code that accepts a Timer in production and a manual
// timer in tests.
func (t *Timer) AsTimerI() TimerI {
return (*timer)(t)
}
// Reset changes the timer to expire after duration d and returns
// the new value of the timer. This method includes the fix proposed
// in https://github.com/golang/go/issues/11513#issuecomment-157062583,
// but requires users of Timer to set Timer.Read to true whenever
// they successfully read from the Timer's channel.
func (t *Timer) Reset(d time.Duration) {
if t.timer == nil {
switch timer := timeTimerPool.Get(); timer {
case nil:
t.timer = time.NewTimer(d)
default:
t.timer = timer.(*time.Timer)
t.timer.Reset(d)
}
t.C = t.timer.C
return
}
t.stopAndDrain()
t.timer.Reset(d)
t.Read = false
}
// Stop prevents the Timer from firing. It returns true if the call stops
// the timer, false if the timer has already expired, been stopped previously,
// or had never been initialized with a call to Timer.Reset. Stop does not
// close the channel, to prevent a read from succeeding incorrectly.
func (t *Timer) Stop() bool {
var res bool
if t.timer != nil {
res = t.stopAndDrain()
timeTimerPool.Put(t.timer)
}
*t = Timer{}
return res
}
// stopAndDrain stops the underlying *time.Timer and drains the channel if the
// timer has already expired but the channel has not been read from. It returns
// true if the call stops the timer and false if the timer has already expired.
// t.timer must not be nil and must not have already been stopped.
func (t *Timer) stopAndDrain() bool {
res := t.timer.Stop()
if !res && !t.Read {
// The timer expired, but the channel has not been read from. Drain it.
<-t.C
// Even though we did not stop the timer before it expired, the channel was
// never read from and we had to drain it ourselves, so we consider the stop
// attempt successful. For any caller consulting this return value, this is
// an indication that after the call to Stop, the timer channel will remain
// empty until the next call to Reset.
res = true
}
return res
}
// Copyright 2016 The Cockroach Authors.
//
// Use of this software is governed by the CockroachDB Software License
// included in the /LICENSE file.
package timeutil
import (
"sort"
"strings"
"sync"
"time"
// embed tzdata in case system tzdata is not available.
_ "time/tzdata"
)
//go:generate go run gen/main.go
// LoadLocation returns the time.Location with the given name.
// The name is taken to be a location name corresponding to a file
// in the IANA Time Zone database, such as "America/New_York".
//
// We do not use Go's time.LoadLocation() directly because it maps
// "Local" to the local time zone, whereas we want UTC.
func LoadLocation(name string) (*time.Location, error) {
loweredName := strings.ToLower(name)
switch loweredName {
case "local", "default":
loweredName = "utc"
name = "UTC"
}
// If we know this is a lowercase name in tzdata, use the uppercase form.
if v, ok := lowercaseTimezones[loweredName]; ok {
// If this location is not found, we may have a case where the tzdata names
// have different values than the system tz names.
// If this is the case, fall back to the default logic, where the name is read
// from other sources before tzdata.
if loc, err := time.LoadLocation(v); err == nil {
return loc, nil
}
}
return time.LoadLocation(name)
}
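// For example (illustrative):
//
//	loc, err := LoadLocation("America/New_York") // standard IANA name
//	loc, err = LoadLocation("local")             // mapped to UTC, per the rule above
//	_, _ = loc, err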
var tzsOnce sync.Once
var tzs []string
// TimeZones lists all supported timezones.
func TimeZones() []string {
tzsOnce.Do(func() {
tzs = make([]string, 0, len(lowercaseTimezones))
for _, tz := range lowercaseTimezones {
tzs = append(tzs, tz)
}
sort.Strings(tzs)
})
return tzs
}
// Copyright 2016 The Cockroach Authors.
//
// Use of this software is governed by the CockroachDB Software License
// included in the /LICENSE file.
package util
import (
"fmt"
"math/rand"
"sort"
)
// moveTopKToFront swaps elements in the range [start, end) so that all elements
// in the range [start, k) are <= all elements in the range [k, end).
func moveTopKToFront(data sort.Interface, start, end, k int, rng *rand.Rand) {
if k < start || k > end {
panic(fmt.Sprintf("k (%d) outside of range [%d, %d)", k, start, end))
}
if k == start || k == end {
return
}
// The strategy is to choose a random pivot and partition the data into
// three regions: elements < pivot, elements == pivot, elements > pivot.
//
// We first partition into two regions: elements <= pivot and
// elements > pivot and further refine the first region if necessary.
// Choose a random pivot and move it to the front.
data.Swap(start, start+rng.Intn(end-start))
pivot := start
l, r := start+1, end
for l < r {
// Invariants:
// - elements in the range [start, l) are <= pivot
// - elements in the range [r, end) are > pivot
if !data.Less(pivot, l) {
l++
} else if data.Less(pivot, r-1) {
r--
} else {
data.Swap(l, r-1)
l++
r--
}
}
mid := l
// Everything in the range [start, mid) is <= the pivot.
// Everything in the range [mid, end) is > the pivot.
if k >= mid {
// In this case, we eliminated at least the pivot (and all elements
// equal to it).
moveTopKToFront(data, mid, end, k, rng)
return
}
// If we eliminated a decent number of elements, we can recurse on [start, mid).
// If the elements were distinct we would do this unconditionally, but in
// general we could have a lot of elements equal to the pivot.
if end-mid > (end-start)/4 {
moveTopKToFront(data, start, mid, k, rng)
return
}
// Now we work on the range [start, mid). Move everything that is equal to the
// pivot to the back.
data.Swap(pivot, mid-1)
pivot = mid - 1
for l, r = start, pivot-1; l <= r; {
if data.Less(l, pivot) {
l++
} else {
data.Swap(l, r)
r--
}
}
// Now everything in the range [start, l) is < the pivot. Everything in the
// range [l, mid) is equal to the pivot. If k is in the [l, mid) range we
// are done, otherwise we recurse on [start, l).
if k <= l {
moveTopKToFront(data, start, l, k, rng)
}
}
// MoveTopKToFront moves the top K elements to the front. It makes O(n) calls to
// data.Less and data.Swap (with very high probability). It uses Hoare's
// selection algorithm (aka quickselect).
func MoveTopKToFront(data sort.Interface, k int) {
if data.Len() <= k {
return
}
// We want the call to be deterministic so we use a predictable seed.
r := rand.New(rand.NewSource(int64(data.Len()*1000 + k)))
moveTopKToFront(data, 0, data.Len(), k, r)
}
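// A minimal usage sketch (illustrative only):
//
//	vals := sort.IntSlice{5, 1, 4, 2, 3}
//	MoveTopKToFront(vals, 2)
//	// vals[0] and vals[1] now hold 1 and 2 (in some order); the rest follow.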
// Copyright 2017 The Cockroach Authors.
//
// Use of this software is governed by the CockroachDB Software License
// included in the /LICENSE file.
package uint128
import (
"encoding/binary"
"encoding/hex"
"github.com/cockroachdb/errors"
)
// Uint128 is a big-endian 128 bit unsigned integer which wraps two uint64s.
type Uint128 struct {
Hi, Lo uint64
}
// GetBytes returns a big-endian byte representation.
func (u Uint128) GetBytes() []byte {
buf := make([]byte, 16)
binary.BigEndian.PutUint64(buf[:8], u.Hi)
binary.BigEndian.PutUint64(buf[8:], u.Lo)
return buf
}
// AppendBytes appends big-endian byte representation to the
// buffer and returns the buffer.
func (u Uint128) AppendBytes(buf []byte) []byte {
buf = binary.BigEndian.AppendUint64(buf, u.Hi)
return binary.BigEndian.AppendUint64(buf, u.Lo)
}
// String returns a hexadecimal string representation.
func (u Uint128) String() string {
return hex.EncodeToString(u.GetBytes())
}
// Equal returns whether or not the two Uint128 values are equal.
func (u Uint128) Equal(o Uint128) bool {
return u.Hi == o.Hi && u.Lo == o.Lo
}
// Compare compares the two Uint128.
func (u Uint128) Compare(o Uint128) int {
if u.Hi > o.Hi {
return 1
} else if u.Hi < o.Hi {
return -1
} else if u.Lo > o.Lo {
return 1
} else if u.Lo < o.Lo {
return -1
}
return 0
}
// Add returns a new Uint128 incremented by n.
func (u Uint128) Add(n uint64) Uint128 {
lo := u.Lo + n
hi := u.Hi
if u.Lo > lo {
hi++
}
return Uint128{hi, lo}
}
// Sub returns a new Uint128 decremented by n.
func (u Uint128) Sub(n uint64) Uint128 {
lo := u.Lo - n
hi := u.Hi
if u.Lo < lo {
hi--
}
return Uint128{hi, lo}
}
// And returns a new Uint128 that is the bitwise AND of two Uint128 values.
func (u Uint128) And(o Uint128) Uint128 {
return Uint128{u.Hi & o.Hi, u.Lo & o.Lo}
}
// Or returns a new Uint128 that is the bitwise OR of two Uint128 values.
func (u Uint128) Or(o Uint128) Uint128 {
return Uint128{u.Hi | o.Hi, u.Lo | o.Lo}
}
// Xor returns a new Uint128 that is the bitwise XOR of two Uint128 values.
func (u Uint128) Xor(o Uint128) Uint128 {
return Uint128{u.Hi ^ o.Hi, u.Lo ^ o.Lo}
}
// FromBytes parses the byte slice as a 128 bit big-endian unsigned integer.
// The caller is responsible for ensuring the byte slice contains 16 bytes.
func FromBytes(b []byte) Uint128 {
hi := binary.BigEndian.Uint64(b[:8])
lo := binary.BigEndian.Uint64(b[8:])
return Uint128{hi, lo}
}
// FromString parses a hexadecimal string as a 128-bit big-endian unsigned integer.
func FromString(s string) (Uint128, error) {
if len(s) > 32 {
return Uint128{}, errors.Errorf("input string %s too large for uint128", s)
}
bytes, err := hex.DecodeString(s)
if err != nil {
return Uint128{}, errors.Wrapf(err, "could not decode %s as hex", s)
}
// Grow the byte slice if it's smaller than 16 bytes, by prepending 0s
if len(bytes) < 16 {
bytesCopy := make([]byte, 16)
copy(bytesCopy[(16-len(bytes)):], bytes)
bytes = bytesCopy
}
return FromBytes(bytes), nil
}
// FromInts takes in two unsigned 64-bit integers and constructs a Uint128.
func FromInts(hi uint64, lo uint64) Uint128 {
return Uint128{hi, lo}
}
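// A minimal usage sketch (illustrative only):
//
//	u := FromInts(0, 0xffffffffffffffff)
//	u = u.Add(1)    // carries into the high word: {Hi: 1, Lo: 0}
//	s := u.String() // "00000000000000010000000000000000"
//	v, err := FromString(s)
//	_ = v.Equal(u)  // true when err == nil
//	_ = err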
// Copyright 2015 The Cockroach Authors.
//
// Use of this software is governed by the CockroachDB Software License
// included in the /LICENSE file.
package util
import (
"fmt"
"net"
"os"
addrutil "github.com/cockroachdb/cockroach/pkg/util/netutil/addr"
)
// TestAddr is an address to use for test servers. Listening on port 0
// causes the kernel to allocate an unused port.
var TestAddr = NewUnresolvedAddr("tcp", "127.0.0.1:0")
// IsolatedTestAddr is initialized in testaddr_*.go
// IsolatedTestAddr is an address to use for tests that need extra
// isolation by using more addresses than 127.0.0.1 (support for this
// is platform-specific and only enabled on Linux). Both TestAddr and
// IsolatedTestAddr guarantee that the chosen port is not in use when
// allocated, but IsolatedTestAddr draws from a larger pool of
// addresses so that when tests are run in a tight loop the system is
// less likely to run out of available ports or give a port to one
// test immediately after it was closed by another.
//
// IsolatedTestAddr should be used for tests that open and close a
// large number of sockets, or tests which stop a server and rely on
// seeing a "connection refused" error afterwards. It cannot be used
// with tests that operate in secure mode since our test certificates
// are only valid for 127.0.0.1.
var IsolatedTestAddr *UnresolvedAddr
// MakeUnresolvedAddr populates an UnresolvedAddr from a network and raw
// address string.
func MakeUnresolvedAddr(network, addr string) UnresolvedAddr {
return UnresolvedAddr{
NetworkField: network,
AddressField: addr,
}
}
// NewUnresolvedAddr creates a new UnresolvedAddr from a network and raw
// address string.
func NewUnresolvedAddr(network, addr string) *UnresolvedAddr {
return &UnresolvedAddr{
NetworkField: network,
AddressField: addr,
}
}
// MakeUnresolvedAddrWithDefaults creates a new UnresolvedAddr from a network and
// raw address string, using the following defaults if not given:
//
// - Network: tcp
// - Host: local hostname or 127.0.0.1
// - Port: given default port
func MakeUnresolvedAddrWithDefaults(network, addr, defaultPort string) UnresolvedAddr {
if network == "" {
network = "tcp"
}
if host, port, err := addrutil.SplitHostPort(addr, defaultPort); err != nil {
addr = net.JoinHostPort(addr, defaultPort)
} else {
if host == "" {
host, err = os.Hostname()
if err != nil {
host = "127.0.0.1"
}
}
addr = net.JoinHostPort(host, port)
}
return UnresolvedAddr{
NetworkField: network,
AddressField: addr,
}
}
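// A minimal usage sketch (illustrative; assumes addrutil.SplitHostPort fills in
// the default port when the input address omits one):
//
//	a := MakeUnresolvedAddrWithDefaults("", "localhost", "26257")
//	// a.NetworkField == "tcp", a.AddressField == "localhost:26257"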
// Note that we make *UnresolvedAddr implement the net.Addr interface, not
// UnresolvedAddr. This is done because assigning a non-empty struct to an
// interface requires an allocation, while assigning a pointer to an interface
// is allocation free. Using an *UnresolvedAddr makes it both clear that an
// allocation is occurring and allows us to avoid an allocation when an
// UnresolvedAddr is a field of a struct (e.g. NodeDescriptor.Address).
var _ net.Addr = &UnresolvedAddr{}
// Network returns the address's network name.
func (a *UnresolvedAddr) Network() string {
return a.NetworkField
}
// IsEmpty returns true if the address has no network or address specified.
func (a UnresolvedAddr) IsEmpty() bool {
return a == (UnresolvedAddr{})
}
// String returns the address's string form.
func (a UnresolvedAddr) String() string {
return a.AddressField
}
// Resolve attempts to resolve a into a net.Addr.
func (a UnresolvedAddr) Resolve() (net.Addr, error) {
switch a.NetworkField {
case "tcp", "tcp4", "tcp6":
return net.ResolveTCPAddr(a.NetworkField, a.AddressField)
case "udp", "udp4", "udp6":
return net.ResolveUDPAddr(a.NetworkField, a.AddressField)
case "unix", "unixgram", "unixpacket":
return net.ResolveUnixAddr(a.NetworkField, a.AddressField)
}
return nil, fmt.Errorf("network %s not supported", a.NetworkField)
}
// Code generated by protoc-gen-gogo. DO NOT EDIT.
// source: util/unresolved_addr.proto
package util
import (
fmt "fmt"
_ "github.com/gogo/protobuf/gogoproto"
proto "github.com/gogo/protobuf/proto"
io "io"
math "math"
math_bits "math/bits"
)
// Reference imports to suppress errors if they are not otherwise used.
var _ = proto.Marshal
var _ = fmt.Errorf
var _ = math.Inf
// This is a compile-time assertion to ensure that this generated file
// is compatible with the proto package it is being compiled against.
// A compilation error at this line likely means your copy of the
// proto package needs to be updated.
const _ = proto.GoGoProtoPackageIsVersion3 // please upgrade the proto package
// UnresolvedAddr is an unresolved version of net.Addr.
type UnresolvedAddr struct {
NetworkField string `protobuf:"bytes,1,opt,name=network_field,json=networkField" json:"network_field"`
AddressField string `protobuf:"bytes,2,opt,name=address_field,json=addressField" json:"address_field"`
}
func (m *UnresolvedAddr) Reset() { *m = UnresolvedAddr{} }
func (*UnresolvedAddr) ProtoMessage() {}
func (*UnresolvedAddr) Descriptor() ([]byte, []int) {
return fileDescriptor_e843f4480e4927e4, []int{0}
}
func (m *UnresolvedAddr) XXX_Unmarshal(b []byte) error {
return m.Unmarshal(b)
}
func (m *UnresolvedAddr) XXX_Marshal(b []byte, deterministic bool) ([]byte, error) {
b = b[:cap(b)]
n, err := m.MarshalToSizedBuffer(b)
if err != nil {
return nil, err
}
return b[:n], nil
}
func (m *UnresolvedAddr) XXX_Merge(src proto.Message) {
xxx_messageInfo_UnresolvedAddr.Merge(m, src)
}
func (m *UnresolvedAddr) XXX_Size() int {
return m.Size()
}
func (m *UnresolvedAddr) XXX_DiscardUnknown() {
xxx_messageInfo_UnresolvedAddr.DiscardUnknown(m)
}
var xxx_messageInfo_UnresolvedAddr proto.InternalMessageInfo
func init() {
proto.RegisterType((*UnresolvedAddr)(nil), "cockroach.util.UnresolvedAddr")
}
func init() { proto.RegisterFile("util/unresolved_addr.proto", fileDescriptor_e843f4480e4927e4) }
var fileDescriptor_e843f4480e4927e4 = []byte{
// 215 bytes of a gzipped FileDescriptorProto
0x1f, 0x8b, 0x08, 0x00, 0x00, 0x00, 0x00, 0x00, 0x02, 0xff, 0xe2, 0x92, 0x2a, 0x2d, 0xc9, 0xcc,
0xd1, 0x2f, 0xcd, 0x2b, 0x4a, 0x2d, 0xce, 0xcf, 0x29, 0x4b, 0x4d, 0x89, 0x4f, 0x4c, 0x49, 0x29,
0xd2, 0x2b, 0x28, 0xca, 0x2f, 0xc9, 0x17, 0xe2, 0x4b, 0xce, 0x4f, 0xce, 0x2e, 0xca, 0x4f, 0x4c,
0xce, 0xd0, 0x03, 0xa9, 0x92, 0x12, 0x49, 0xcf, 0x4f, 0xcf, 0x07, 0x4b, 0xe9, 0x83, 0x58, 0x10,
0x55, 0x4a, 0x05, 0x5c, 0x7c, 0xa1, 0x70, 0xed, 0x8e, 0x29, 0x29, 0x45, 0x42, 0x9a, 0x5c, 0xbc,
0x79, 0xa9, 0x25, 0xe5, 0xf9, 0x45, 0xd9, 0xf1, 0x69, 0x99, 0xa9, 0x39, 0x29, 0x12, 0x8c, 0x0a,
0x8c, 0x1a, 0x9c, 0x4e, 0x2c, 0x27, 0xee, 0xc9, 0x33, 0x04, 0xf1, 0x40, 0xa5, 0xdc, 0x40, 0x32,
0x20, 0xa5, 0x20, 0x0b, 0x53, 0x8b, 0x8b, 0xa1, 0x4a, 0x99, 0x90, 0x95, 0x42, 0xa5, 0xc0, 0x4a,
0xad, 0x38, 0x66, 0x2c, 0x90, 0x67, 0x78, 0xb1, 0x40, 0x9e, 0xd1, 0xc9, 0xfb, 0xc4, 0x43, 0x39,
0x86, 0x13, 0x8f, 0xe4, 0x18, 0x2f, 0x3c, 0x92, 0x63, 0xbc, 0xf1, 0x48, 0x8e, 0xf1, 0xc1, 0x23,
0x39, 0xc6, 0x09, 0x8f, 0xe5, 0x18, 0x2e, 0x3c, 0x96, 0x63, 0xb8, 0xf1, 0x58, 0x8e, 0x21, 0x4a,
0x33, 0x3d, 0xb3, 0x24, 0xa3, 0x34, 0x49, 0x2f, 0x39, 0x3f, 0x57, 0x1f, 0xee, 0x89, 0x94, 0x24,
0x04, 0x5b, 0xbf, 0x20, 0x3b, 0x5d, 0x1f, 0xe4, 0x29, 0x40, 0x00, 0x00, 0x00, 0xff, 0xff, 0xc2,
0x21, 0xae, 0xea, 0x01, 0x01, 0x00, 0x00,
}
func (this *UnresolvedAddr) Equal(that interface{}) bool {
if that == nil {
return this == nil
}
that1, ok := that.(*UnresolvedAddr)
if !ok {
that2, ok := that.(UnresolvedAddr)
if ok {
that1 = &that2
} else {
return false
}
}
if that1 == nil {
return this == nil
} else if this == nil {
return false
}
if this.NetworkField != that1.NetworkField {
return false
}
if this.AddressField != that1.AddressField {
return false
}
return true
}
func (m *UnresolvedAddr) Marshal() (dAtA []byte, err error) {
size := m.Size()
dAtA = make([]byte, size)
n, err := m.MarshalToSizedBuffer(dAtA[:size])
if err != nil {
return nil, err
}
return dAtA[:n], nil
}
func (m *UnresolvedAddr) MarshalTo(dAtA []byte) (int, error) {
size := m.Size()
return m.MarshalToSizedBuffer(dAtA[:size])
}
func (m *UnresolvedAddr) MarshalToSizedBuffer(dAtA []byte) (int, error) {
i := len(dAtA)
_ = i
var l int
_ = l
i -= len(m.AddressField)
copy(dAtA[i:], m.AddressField)
i = encodeVarintUnresolvedAddr(dAtA, i, uint64(len(m.AddressField)))
i--
dAtA[i] = 0x12
i -= len(m.NetworkField)
copy(dAtA[i:], m.NetworkField)
i = encodeVarintUnresolvedAddr(dAtA, i, uint64(len(m.NetworkField)))
i--
dAtA[i] = 0xa
return len(dAtA) - i, nil
}
func encodeVarintUnresolvedAddr(dAtA []byte, offset int, v uint64) int {
offset -= sovUnresolvedAddr(v)
base := offset
for v >= 1<<7 {
dAtA[offset] = uint8(v&0x7f | 0x80)
v >>= 7
offset++
}
dAtA[offset] = uint8(v)
return base
}
func (m *UnresolvedAddr) Size() (n int) {
if m == nil {
return 0
}
var l int
_ = l
l = len(m.NetworkField)
n += 1 + l + sovUnresolvedAddr(uint64(l))
l = len(m.AddressField)
n += 1 + l + sovUnresolvedAddr(uint64(l))
return n
}
func sovUnresolvedAddr(x uint64) (n int) {
return int((uint32(math_bits.Len64(x|1)+6) * 37) >> 8)
}
func sozUnresolvedAddr(x uint64) (n int) {
return sovUnresolvedAddr(uint64((x << 1) ^ uint64((int64(x) >> 63))))
}
func (m *UnresolvedAddr) Unmarshal(dAtA []byte) error {
l := len(dAtA)
iNdEx := 0
for iNdEx < l {
preIndex := iNdEx
var wire uint64
for shift := uint(0); ; shift += 7 {
if shift >= 64 {
return ErrIntOverflowUnresolvedAddr
}
if iNdEx >= l {
return io.ErrUnexpectedEOF
}
b := dAtA[iNdEx]
iNdEx++
wire |= uint64(b&0x7F) << shift
if b < 0x80 {
break
}
}
fieldNum := int32(wire >> 3)
wireType := int(wire & 0x7)
if wireType == 4 {
return fmt.Errorf("proto: UnresolvedAddr: wiretype end group for non-group")
}
if fieldNum <= 0 {
return fmt.Errorf("proto: UnresolvedAddr: illegal tag %d (wire type %d)", fieldNum, wire)
}
switch fieldNum {
case 1:
if wireType != 2 {
return fmt.Errorf("proto: wrong wireType = %d for field NetworkField", wireType)
}
var stringLen uint64
for shift := uint(0); ; shift += 7 {
if shift >= 64 {
return ErrIntOverflowUnresolvedAddr
}
if iNdEx >= l {
return io.ErrUnexpectedEOF
}
b := dAtA[iNdEx]
iNdEx++
stringLen |= uint64(b&0x7F) << shift
if b < 0x80 {
break
}
}
intStringLen := int(stringLen)
if intStringLen < 0 {
return ErrInvalidLengthUnresolvedAddr
}
postIndex := iNdEx + intStringLen
if postIndex < 0 {
return ErrInvalidLengthUnresolvedAddr
}
if postIndex > l {
return io.ErrUnexpectedEOF
}
m.NetworkField = string(dAtA[iNdEx:postIndex])
iNdEx = postIndex
case 2:
if wireType != 2 {
return fmt.Errorf("proto: wrong wireType = %d for field AddressField", wireType)
}
var stringLen uint64
for shift := uint(0); ; shift += 7 {
if shift >= 64 {
return ErrIntOverflowUnresolvedAddr
}
if iNdEx >= l {
return io.ErrUnexpectedEOF
}
b := dAtA[iNdEx]
iNdEx++
stringLen |= uint64(b&0x7F) << shift
if b < 0x80 {
break
}
}
intStringLen := int(stringLen)
if intStringLen < 0 {
return ErrInvalidLengthUnresolvedAddr
}
postIndex := iNdEx + intStringLen
if postIndex < 0 {
return ErrInvalidLengthUnresolvedAddr
}
if postIndex > l {
return io.ErrUnexpectedEOF
}
m.AddressField = string(dAtA[iNdEx:postIndex])
iNdEx = postIndex
default:
iNdEx = preIndex
skippy, err := skipUnresolvedAddr(dAtA[iNdEx:])
if err != nil {
return err
}
if (skippy < 0) || (iNdEx+skippy) < 0 {
return ErrInvalidLengthUnresolvedAddr
}
if (iNdEx + skippy) > l {
return io.ErrUnexpectedEOF
}
iNdEx += skippy
}
}
if iNdEx > l {
return io.ErrUnexpectedEOF
}
return nil
}
func skipUnresolvedAddr(dAtA []byte) (n int, err error) {
l := len(dAtA)
iNdEx := 0
depth := 0
for iNdEx < l {
var wire uint64
for shift := uint(0); ; shift += 7 {
if shift >= 64 {
return 0, ErrIntOverflowUnresolvedAddr
}
if iNdEx >= l {
return 0, io.ErrUnexpectedEOF
}
b := dAtA[iNdEx]
iNdEx++
wire |= (uint64(b) & 0x7F) << shift
if b < 0x80 {
break
}
}
wireType := int(wire & 0x7)
switch wireType {
case 0:
for shift := uint(0); ; shift += 7 {
if shift >= 64 {
return 0, ErrIntOverflowUnresolvedAddr
}
if iNdEx >= l {
return 0, io.ErrUnexpectedEOF
}
iNdEx++
if dAtA[iNdEx-1] < 0x80 {
break
}
}
case 1:
iNdEx += 8
case 2:
var length int
for shift := uint(0); ; shift += 7 {
if shift >= 64 {
return 0, ErrIntOverflowUnresolvedAddr
}
if iNdEx >= l {
return 0, io.ErrUnexpectedEOF
}
b := dAtA[iNdEx]
iNdEx++
length |= (int(b) & 0x7F) << shift
if b < 0x80 {
break
}
}
if length < 0 {
return 0, ErrInvalidLengthUnresolvedAddr
}
iNdEx += length
case 3:
depth++
case 4:
if depth == 0 {
return 0, ErrUnexpectedEndOfGroupUnresolvedAddr
}
depth--
case 5:
iNdEx += 4
default:
return 0, fmt.Errorf("proto: illegal wireType %d", wireType)
}
if iNdEx < 0 {
return 0, ErrInvalidLengthUnresolvedAddr
}
if depth == 0 {
return iNdEx, nil
}
}
return 0, io.ErrUnexpectedEOF
}
var (
ErrInvalidLengthUnresolvedAddr = fmt.Errorf("proto: negative length found during unmarshaling")
ErrIntOverflowUnresolvedAddr = fmt.Errorf("proto: integer overflow")
ErrUnexpectedEndOfGroupUnresolvedAddr = fmt.Errorf("proto: unexpected end of group")
)
// Copyright 2019 The Cockroach Authors.
//
// Use of this software is governed by the CockroachDB Software License
// included in the /LICENSE file.
// Copyright (C) 2013-2018 by Maxim Bublis <b@codemonkey.ru>
// Use of this source code is governed by a MIT-style
// license that can be found in licenses/MIT-gofrs.txt.
// This code originated in github.com/gofrs/uuid.
package uuid
import (
"bytes"
"encoding/hex"
"fmt"
)
// FromBytes returns a UUID generated from the raw byte slice input.
// It will return an error if the slice isn't 16 bytes long.
func FromBytes(input []byte) (UUID, error) {
u := UUID{}
err := u.UnmarshalBinary(input)
return u, err
}
// FromBytesOrNil returns a UUID generated from the raw byte slice input.
// Same behavior as FromBytes(), but returns uuid.Nil instead of an error.
func FromBytesOrNil(input []byte) UUID {
uuid, err := FromBytes(input)
if err != nil {
return Nil
}
return uuid
}
// FromString returns a UUID parsed from the input string.
// Input is expected in a form accepted by UnmarshalText.
func FromString(input string) (UUID, error) {
u := UUID{}
err := u.UnmarshalText([]byte(input))
return u, err
}
// FromStringOrNil returns a UUID parsed from the input string.
// Same behavior as FromString(), but returns uuid.Nil instead of an error.
func FromStringOrNil(input string) UUID {
uuid, err := FromString(input)
if err != nil {
return Nil
}
return uuid
}
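// A short sketch contrasting the two parsing helpers; the inputs are
// illustrative only:
//
// u, err := FromString("6ba7b810-9dad-11d1-80b4-00c04fd430c8") // (UUID, error)
// v := FromStringOrNil("not-a-uuid")                           // Nil on parse failure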
// MarshalText implements the encoding.TextMarshaler interface.
// The encoding is the same as returned by the String() method.
func (u UUID) MarshalText() ([]byte, error) {
return []byte(u.String()), nil
}
// UnmarshalText implements the encoding.TextUnmarshaler interface.
// Following formats are supported:
//
// "6ba7b810-9dad-11d1-80b4-00c04fd430c8",
// "{6ba7b810-9dad-11d1-80b4-00c04fd430c8}",
// "urn:uuid:6ba7b810-9dad-11d1-80b4-00c04fd430c8"
// "6ba7b8109dad11d180b400c04fd430c8"
// "{6ba7b8109dad11d180b400c04fd430c8}",
// "{6ba7b810-9dad-11d1-80b4-00c04fd430c8}",
// "urn:uuid:6ba7b8109dad11d180b400c04fd430c8",
// "urn:uuid:6ba7b810-9dad-11d1-80b4-00c04fd430c8",
// "6ba7-b810-9dad-11d1-80b4-00c0-4fd4-30c8"
//
// ABNF for supported UUID text representation follows:
//
// URN := 'urn'
// UUID-NID := 'uuid'
//
// hexdig := '0' | '1' | '2' | '3' | '4' | '5' | '6' | '7' | '8' | '9' |
// 'a' | 'b' | 'c' | 'd' | 'e' | 'f' |
// 'A' | 'B' | 'C' | 'D' | 'E' | 'F'
//
// hexoct := hexdig hexdig
// 2hexoct := hexoct hexoct
// 4hexoct := 2hexoct 2hexoct
// 6hexoct := 4hexoct 2hexoct
// 12hexoct := 6hexoct 6hexoct
//
// hashlike := 12hexoct
// hyphenated := hyphen after any group of 4 hexdig
// Ex.6ba7-b810-9dad-11d1-80b4-00c0-4fd4-30c8
// Ex.6ba7-b810-9dad11d1-80b400c0-4fd4-30c8
//
// uuid := hyphenated | hashlike | braced | urn
//
// braced := '{' hyphenated '}' | '{' hashlike '}'
// urn := URN ':' UUID-NID ':' hyphenated
func (u *UUID) UnmarshalText(text []byte) error {
l := len(text)
stringifiedText := string(text)
if l < 32 || l > 48 {
return fmt.Errorf("uuid: incorrect UUID length: %s", text)
} else if stringifiedText[0] == '{' && stringifiedText[l-1] == '}' {
return u.decodeHyphenated(text[1 : l-1])
} else if bytes.Equal(text[:9], urnPrefix) {
return u.decodeHyphenated(text[9:l])
} else {
return u.decodeHyphenated(text)
}
}
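// All accepted text forms decode to the same value; a sketch using the
// canonical example UUID from the comment above:
//
// var a, b, c UUID
// _ = a.UnmarshalText([]byte("6ba7b810-9dad-11d1-80b4-00c04fd430c8"))
// _ = b.UnmarshalText([]byte("{6ba7b8109dad11d180b400c04fd430c8}"))
// _ = c.UnmarshalText([]byte("urn:uuid:6ba7b810-9dad-11d1-80b4-00c04fd430c8"))
// // a == b && b == c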
// decodeHashLike decodes UUID strings that use the following format:
//
// "6ba7b8109dad11d180b400c04fd430c8".
func (u *UUID) decodeHashLike(t []byte) error {
src := t[:]
dst := u[:]
_, err := hex.Decode(dst, src)
return err
}
// decodeHyphenated decodes UUID strings that use the following format:
//
// "6ba7-b810-9dad-11d1-80b4-00c0-4fd4-30c8"
// "6ba7b810-9dad-11d1-80b400c0-4fd4-30c8"
func (u *UUID) decodeHyphenated(t []byte) error {
l := len(t)
if l < 32 || l > 40 {
return fmt.Errorf("uuid: incorrect UUID format: %s", t)
}
hashLike := make([]byte, 32)
countSinceHyphen := 0
i := 0
for _, c := range t {
if i >= len(hashLike) {
return fmt.Errorf("uuid: incorrect UUID format: %s", t)
}
if c == '-' {
if countSinceHyphen == 0 || countSinceHyphen%4 != 0 {
return fmt.Errorf("uuid: incorrect UUID format: %s", t)
}
countSinceHyphen = 0
continue
}
hashLike[i] = c
i++
countSinceHyphen++
}
if i != len(hashLike) {
return fmt.Errorf("uuid: incorrect UUID format: %s", t)
}
return u.decodeHashLike(hashLike)
}
// MarshalBinary implements the encoding.BinaryMarshaler interface.
func (u UUID) MarshalBinary() ([]byte, error) {
return u.bytes(), nil
}
// UnmarshalBinary implements the encoding.BinaryUnmarshaler interface.
// It will return an error if the slice isn't 16 bytes long.
func (u *UUID) UnmarshalBinary(data []byte) error {
if len(data) != Size {
return fmt.Errorf("uuid: UUID must be exactly 16 bytes long, got %d bytes", len(data))
}
copy(u[:], data)
return nil
}
// Copyright 2019 The Cockroach Authors.
//
// Use of this software is governed by the CockroachDB Software License
// included in the /LICENSE file.
// Copyright (c) 2018 Andrei Tudor Călin <mail@acln.ro>
// Use of this source code is governed by a MIT-style
// license that can be found in licenses/MIT-gofrs.txt.
// This code originated in github.com/gofrs/uuid.
//go:build gofuzz
package uuid
// Fuzz implements a simple fuzz test for FromString / UnmarshalText.
//
// To run:
//
// $ go get github.com/dvyukov/go-fuzz/...
// $ cd $GOPATH/src/github.com/gofrs/uuid
// $ go-fuzz-build github.com/gofrs/uuid
// $ go-fuzz -bin=uuid-fuzz.zip -workdir=./testdata
//
// If you make significant changes to FromString / UnmarshalText and add
// new cases to fromStringTests (in codec_test.go), please run
//
// $ go test -seed_fuzz_corpus
//
// to seed the corpus with the new interesting inputs, then run the fuzzer.
func Fuzz(data []byte) int {
_, err := FromString(string(data))
if err != nil {
return 0
}
return 1
}
// Copyright 2019 The Cockroach Authors.
//
// Use of this software is governed by the CockroachDB Software License
// included in the /LICENSE file.
// Copyright (C) 2013-2018 by Maxim Bublis <b@codemonkey.ru>
// Use of this source code is governed by a MIT-style
// license that can be found in licenses/MIT-gofrs.txt.
// This code originated in github.com/gofrs/uuid.
package uuid
import (
"crypto/md5"
"crypto/sha1"
"encoding/binary"
"fmt"
"hash"
math_rand "math/rand/v2"
"net"
"sync"
"time"
"github.com/cockroachdb/cockroach/pkg/util/syncutil"
"github.com/cockroachdb/errors"
)
// Difference in 100-nanosecond intervals between
// UUID epoch (October 15, 1582) and Unix epoch (January 1, 1970).
const epochStart = 122192928000000000
type epochFunc func() time.Time
// HWAddrFunc is the function type used to provide hardware (MAC) addresses.
type HWAddrFunc func() (net.HardwareAddr, error)
// DefaultGenerator is the default UUID Generator used by this package.
// It uses math/rand/v2 as the source of entropy, which is backed by the
// cryptographically random ChaCha8 algorithm.
// See https://go.dev/blog/chacha8rand.
var DefaultGenerator Generator = NewGen()
// NewV1 returns a UUID based on the current timestamp and MAC address.
func NewV1() (UUID, error) {
return DefaultGenerator.NewV1()
}
// NewV3 returns a UUID based on the MD5 hash of the namespace UUID and name.
func NewV3(ns UUID, name string) UUID {
return DefaultGenerator.NewV3(ns, name)
}
// NewV4 returns a randomly generated UUID.
func NewV4() UUID {
return DefaultGenerator.NewV4()
}
// NewV5 returns a UUID based on SHA-1 hash of the namespace UUID and name.
func NewV5(ns UUID, name string) UUID {
return DefaultGenerator.NewV5(ns, name)
}
// Generator provides an interface for generating UUIDs.
type Generator interface {
NewV1() (UUID, error)
// NewV2(domain byte) (UUID, error) // CRL: Removed support for V2.
NewV3(ns UUID, name string) UUID
NewV4() UUID
NewV5(ns UUID, name string) UUID
}
// Gen is a reference UUID generator based on the specifications laid out in
// RFC-4122 and DCE 1.1: Authentication and Security Services. This type
// satisfies the Generator interface as defined in this package.
//
// For consumers who are generating V1 UUIDs, but don't want to expose the MAC
// address of the node generating the UUIDs, the NewGenWithHWAF() function has been
// provided as a convenience. See the function's documentation for more info.
//
// The authors of this package do not feel that the majority of users will need
// to obfuscate their MAC address, and so we recommend using NewGen() to create
// a new generator.
type Gen struct {
clockSequenceOnce sync.Once
hardwareAddrOnce sync.Once
storageMutex syncutil.Mutex
// randUint64 is the function used to generate random uint64 values. The
// function is stored directly to avoid the overhead of interface dispatch.
randUint64 func() uint64
epochFunc epochFunc
hwAddrFunc HWAddrFunc
lastTime uint64
clockSequence uint16
hardwareAddr [6]byte
}
// interface check -- build will fail if *Gen doesn't satisfy Generator
var _ Generator = (*Gen)(nil)
// NewGen returns a new instance of Gen with some default values set. Most
// people should use this.
// NewGen by default uses math/rand/v2's Uint64 (backed by the ChaCha8
// generator) as its source of randomness.
func NewGen() *Gen {
return NewGenWithHWAF(defaultHWAddrFunc)
}
// NewGenWithRand returns a new instance of Gen that uses randUint64 as its
// source of randomness.
func NewGenWithRand(randUint64 func() uint64) *Gen {
g := NewGen()
g.randUint64 = randUint64
return g
}
// NewGenWithHWAF builds a new UUID generator with the HWAddrFunc provided. Most
// consumers should use NewGen() instead.
//
// This is used so that consumers can generate their own MAC addresses, for use
// in the generated UUIDs, if there is some concern about exposing the physical
// address of the machine generating the UUID.
//
// The Gen generator will only invoke the HWAddrFunc once, and cache that MAC
// address for all the future UUIDs generated by it. If you'd like to switch the
// MAC address being used, you'll need to create a new generator using this
// function.
func NewGenWithHWAF(hwaf HWAddrFunc) *Gen {
return &Gen{
epochFunc: time.Now,
hwAddrFunc: hwaf,
// "math/rand".Uint64 is safe for concurrent use. As of go1.22, the
// math/rand (and math/rand/v2) package uses a cryptographically secure RNG.
// See https://go.dev/blog/randv2 and https://go.dev/blog/chacha8rand.
randUint64: math_rand.Uint64,
}
}
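// A sketch of supplying a fixed, non-physical MAC address to the generator;
// the address bytes are hypothetical:
//
// gen := NewGenWithHWAF(func() (net.HardwareAddr, error) {
// return net.HardwareAddr{0x02, 0x00, 0x5e, 0x10, 0x00, 0x01}, nil
// })
// u, err := gen.NewV1() // the supplied address ends up in bytes 10..15 of u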
// NewV1 returns a UUID based on the current timestamp and MAC address.
func (g *Gen) NewV1() (UUID, error) {
u := UUID{}
timeNow, clockSeq := g.getClockSequence()
binary.BigEndian.PutUint32(u[0:], uint32(timeNow))
binary.BigEndian.PutUint16(u[4:], uint16(timeNow>>32))
binary.BigEndian.PutUint16(u[6:], uint16(timeNow>>48))
binary.BigEndian.PutUint16(u[8:], clockSeq)
hardwareAddr, err := g.getHardwareAddr()
if err != nil {
return Nil, err
}
copy(u[10:], hardwareAddr)
u.SetVersion(V1)
u.SetVariant(VariantRFC4122)
return u, nil
}
// NewV3 returns a UUID based on the MD5 hash of the namespace UUID and name.
func (g *Gen) NewV3(ns UUID, name string) UUID {
u := newFromHash(md5.New(), ns, name)
u.SetVersion(V3)
u.SetVariant(VariantRFC4122)
return u
}
// NewV4 returns a randomly generated UUID.
func (g *Gen) NewV4() UUID {
u := UUID{}
binary.BigEndian.PutUint64(u[:Size/2], g.randUint64())
binary.BigEndian.PutUint64(u[Size/2:], g.randUint64())
u.SetVersion(V4)
u.SetVariant(VariantRFC4122)
return u
}
// NewV5 returns a UUID based on SHA-1 hash of the namespace UUID and name.
func (g *Gen) NewV5(ns UUID, name string) UUID {
u := newFromHash(sha1.New(), ns, name)
u.SetVersion(V5)
u.SetVariant(VariantRFC4122)
return u
}
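// A sketch contrasting the name-based and random constructors; the inputs are
// illustrative:
//
// g := NewGen()
// a := g.NewV5(NamespaceDNS, "example.com") // deterministic: same inputs, same UUID
// b := g.NewV5(NamespaceDNS, "example.com") // a == b
// c := g.NewV4()                            // random: differs on every call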
// Returns the current timestamp (100-nanosecond intervals since the UUID
// epoch) and the clock sequence.
func (g *Gen) getClockSequence() (uint64, uint16) {
g.clockSequenceOnce.Do(func() {
buf := make([]byte, 8)
binary.BigEndian.PutUint64(buf[:], g.randUint64())
g.clockSequence = binary.BigEndian.Uint16(buf)
})
g.storageMutex.Lock()
defer g.storageMutex.Unlock()
timeNow := g.getEpoch()
// Clock didn't change since last UUID generation.
// Should increase clock sequence.
if timeNow <= g.lastTime {
g.clockSequence++
}
g.lastTime = timeNow
return timeNow, g.clockSequence
}
// Returns the hardware address.
func (g *Gen) getHardwareAddr() ([]byte, error) {
var err error
g.hardwareAddrOnce.Do(func() {
var hwAddr net.HardwareAddr
if hwAddr, err = g.hwAddrFunc(); err == nil {
copy(g.hardwareAddr[:], hwAddr)
return
}
// Initialize hardwareAddr randomly if no real network interface is
// available.
hwAddr, err = RandomHardwareAddrFunc()
if err != nil {
panic("RandomHardwareAddrFunc does not return an error")
}
copy(g.hardwareAddr[:], hwAddr)
// Set multicast bit as recommended by RFC-4122
g.hardwareAddr[0] |= 0x01
})
if err != nil {
return []byte{}, err
}
return g.hardwareAddr[:], nil
}
// Returns the difference between UUID epoch (October 15, 1582)
// and current time in 100-nanosecond intervals.
func (g *Gen) getEpoch() uint64 {
return epochStart + uint64(g.epochFunc().UnixNano()/100)
}
// Returns the UUID based on the hashing of the namespace UUID and name.
func newFromHash(h hash.Hash, ns UUID, name string) UUID {
u := UUID{}
mustWrite := func(data []byte) {
if _, err := h.Write(data); err != nil {
panic(errors.Wrap(err, "failed to write to hash"))
}
}
mustWrite(ns[:])
mustWrite([]byte(name))
copy(u[:], h.Sum(nil))
return u
}
// Returns the hardware address.
func defaultHWAddrFunc() (net.HardwareAddr, error) {
ifaces, err := net.Interfaces()
if err != nil {
return []byte{}, err
}
for _, iface := range ifaces {
if len(iface.HardwareAddr) >= 6 {
return iface.HardwareAddr, nil
}
}
return []byte{}, fmt.Errorf("uuid: no HW address found")
}
// RandomHardwareAddrFunc returns a random hardware address, with the multicast
// and local-admin bits set as per the IEEE802 spec. This function never
// returns an error, but the signature has to match the HWAddrFunc type.
func RandomHardwareAddrFunc() (net.HardwareAddr, error) {
var hardwareAddr = make([]byte, 8)
binary.BigEndian.PutUint64(hardwareAddr, math_rand.Uint64())
// Set multicast bit and local-admin bit to match Postgres.
hardwareAddr[0] |= 0x03
// Discard the last 2 bytes.
return hardwareAddr[:6], nil
}
// Copyright 2019 The Cockroach Authors.
//
// Use of this software is governed by the CockroachDB Software License
// included in the /LICENSE file.
// Copyright (C) 2013-2018 by Maxim Bublis <b@codemonkey.ru>
// Use of this source code is governed by a MIT-style
// license that can be found in licenses/MIT-gofrs.txt.
// This code originated in github.com/gofrs/uuid.
package uuid
import (
"bytes"
"database/sql/driver"
"encoding/json"
"fmt"
)
// Value implements the driver.Valuer interface.
func (u UUID) Value() (driver.Value, error) {
return u.String(), nil
}
// Scan implements the sql.Scanner interface.
// A 16-byte slice will be handled by UnmarshalBinary, while
// a longer byte slice or a string will be handled by UnmarshalText.
func (u *UUID) Scan(src interface{}) error {
switch src := src.(type) {
case UUID: // support gorm convert from UUID to NullUUID
*u = src
return nil
case []byte:
if len(src) == Size {
return u.UnmarshalBinary(src)
}
return u.UnmarshalText(src)
case string:
return u.UnmarshalText([]byte(src))
}
return fmt.Errorf("uuid: cannot convert %T to UUID", src)
}
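// A sketch of the source types Scan accepts; the inputs are illustrative:
//
// var u UUID
// _ = u.Scan("6ba7b810-9dad-11d1-80b4-00c04fd430c8") // string -> UnmarshalText
// _ = u.Scan(make([]byte, Size))                      // 16-byte slice -> UnmarshalBinary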
// NullUUID can be used with the standard sql package to represent a
// UUID value that can be NULL in the database.
type NullUUID struct {
UUID UUID
Valid bool
}
// Value implements the driver.Valuer interface.
func (u NullUUID) Value() (driver.Value, error) {
if !u.Valid {
return nil, nil
}
// Delegate to UUID Value function
return u.UUID.Value()
}
// Scan implements the sql.Scanner interface.
func (u *NullUUID) Scan(src interface{}) error {
if src == nil {
u.UUID, u.Valid = Nil, false
return nil
}
// Delegate to UUID Scan function
u.Valid = true
return u.UUID.Scan(src)
}
// MarshalJSON marshals the NullUUID as null or the nested UUID
func (u NullUUID) MarshalJSON() ([]byte, error) {
if !u.Valid {
return json.Marshal(nil)
}
return json.Marshal(u.UUID)
}
// UnmarshalJSON unmarshals a NullUUID
func (u *NullUUID) UnmarshalJSON(b []byte) error {
if bytes.Equal(b, []byte("null")) {
u.UUID, u.Valid = Nil, false
return nil
}
if err := json.Unmarshal(b, &u.UUID); err != nil {
return err
}
u.Valid = true
return nil
}
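// A sketch of a NullUUID JSON round trip; the values are illustrative:
//
// var n NullUUID
// _ = n.UnmarshalJSON([]byte("null")) // n.Valid == false
// _ = n.UnmarshalJSON([]byte(`"6ba7b810-9dad-11d1-80b4-00c04fd430c8"`)) // n.Valid == true
// out, _ := n.MarshalJSON() // quoted canonical string when Valid, "null" otherwise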
// Copyright 2019 The Cockroach Authors.
//
// Use of this software is governed by the CockroachDB Software License
// included in the /LICENSE file.
// Copyright (C) 2013-2018 by Maxim Bublis <b@codemonkey.ru>
// Use of this source code is governed by a MIT-style
// license that can be found in licenses/MIT-gofrs.txt.
// This code originated in github.com/gofrs/uuid.
package uuid
import (
"encoding/binary"
"encoding/hex"
"fmt"
"math"
"time"
"github.com/cockroachdb/cockroach/pkg/util/timeutil"
"github.com/cockroachdb/cockroach/pkg/util/uint128"
"github.com/cockroachdb/errors"
"github.com/cockroachdb/redact"
)
// Size of a UUID in bytes.
const Size = 16
// RFC4122StrSize is the size of the RFC-4122 string representation of a UUID.
const RFC4122StrSize = 36
// Bytes represents a byte slice which is intended to be interpreted as a binary
// encoding of a UUID.
type Bytes []byte
var _ redact.SafeValue = Bytes{}
// SafeValue implements the redact.SafeValue interface.
func (b Bytes) SafeValue() {}
// GetUUID constructs a UUID from the bytes. If the data is not valid, a zero
// value will be returned.
func (b Bytes) GetUUID() UUID { return FromBytesOrNil(b) }
// String returns the string representation of the underlying UUID.
func (b Bytes) String() string {
return b.GetUUID().String()
}
// UUID is an array type to represent the value of a UUID, as defined in RFC-4122.
type UUID [Size]byte
var _ redact.SafeValue = UUID{}
// SafeValue implements the redact.SafeValue interface.
func (u UUID) SafeValue() {}
// UUID versions.
const (
_ byte = iota
V1 // Version 1 (date-time and MAC address)
_ // Version 2 (date-time and MAC address, DCE security version)
V3 // Version 3 (namespace name-based)
V4 // Version 4 (random)
V5 // Version 5 (namespace name-based)
)
// UUID layout variants.
const (
VariantNCS byte = iota
VariantRFC4122
VariantMicrosoft
VariantFuture
)
// Timestamp is the count of 100-nanosecond intervals since 00:00:00.00,
// 15 October 1582 within a V1 UUID. This type has no meaning for V2-V5
// UUIDs since they don't have an embedded timestamp.
type Timestamp uint64
var _ redact.SafeValue = Timestamp(0)
// SafeValue implements the redact.SafeValue interface.
func (t Timestamp) SafeValue() {}
const _100nsPerSecond = 10000000
// Time returns the UTC time.Time representation of a Timestamp
func (t Timestamp) Time() (time.Time, error) {
secs := uint64(t) / _100nsPerSecond
nsecs := 100 * (uint64(t) % _100nsPerSecond)
return timeutil.Unix(int64(secs)-(epochStart/_100nsPerSecond), int64(nsecs)), nil
}
// TimestampFromV1 returns the Timestamp embedded within a V1 UUID.
// Returns an error if the UUID is any version other than 1.
func TimestampFromV1(u UUID) (Timestamp, error) {
if u.Version() != 1 {
err := fmt.Errorf("uuid: %s is version %d, not version 1", u, u.Version())
return 0, err
}
low := binary.BigEndian.Uint32(u[0:4])
mid := binary.BigEndian.Uint16(u[4:6])
hi := binary.BigEndian.Uint16(u[6:8]) & 0xfff
return Timestamp(uint64(low) + (uint64(mid) << 32) + (uint64(hi) << 48)), nil
}
// String parse helpers.
var urnPrefix = []byte("urn:uuid:")
// Nil is the nil UUID, as specified in RFC-4122, which has all 128 bits set to
// zero.
var Nil = UUID{}
// Max is the maximum possible UUID, which has all 128 bits set to 1.
var Max = FromUint128(uint128.FromInts(math.MaxUint64, math.MaxUint64))
// Predefined namespace UUIDs.
var (
NamespaceDNS = Must(FromString("6ba7b810-9dad-11d1-80b4-00c04fd430c8"))
NamespaceURL = Must(FromString("6ba7b811-9dad-11d1-80b4-00c04fd430c8"))
NamespaceOID = Must(FromString("6ba7b812-9dad-11d1-80b4-00c04fd430c8"))
NamespaceX500 = Must(FromString("6ba7b814-9dad-11d1-80b4-00c04fd430c8"))
)
// Version returns the algorithm version used to generate the UUID.
func (u UUID) Version() byte {
return u[6] >> 4
}
// Variant returns the UUID layout variant.
func (u UUID) Variant() byte {
switch {
case (u[8] >> 7) == 0x00:
return VariantNCS
case (u[8] >> 6) == 0x02:
return VariantRFC4122
case (u[8] >> 5) == 0x06:
return VariantMicrosoft
case (u[8] >> 5) == 0x07:
fallthrough
default:
return VariantFuture
}
}
// bytes returns a byte slice representation of the UUID. It incurs an
// allocation if the return value escapes.
func (u UUID) bytes() []byte {
return u[:]
}
// bytesMut returns a mutable byte slice representation of the UUID. Unlike
// bytes, it does not necessarily incur an allocation if the return value
// escapes. Instead, the return value escaping will cause the method's receiver
// (and any struct that it is a part of) to escape.
func (u *UUID) bytesMut() []byte {
return u[:]
}
// String returns a canonical RFC-4122 string representation of the UUID:
// xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx.
func (u UUID) String() string {
buf := make([]byte, RFC4122StrSize)
u.StringBytes(buf)
return string(buf)
}
// StringBytes writes the result of String directly into a buffer, which must
// have a length of at least 36.
func (u UUID) StringBytes(buf []byte) {
_ = buf[:RFC4122StrSize]
hex.Encode(buf[0:8], u[0:4])
buf[8] = '-'
hex.Encode(buf[9:13], u[4:6])
buf[13] = '-'
hex.Encode(buf[14:18], u[6:8])
buf[18] = '-'
hex.Encode(buf[19:23], u[8:10])
buf[23] = '-'
hex.Encode(buf[24:], u[10:])
}
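// A sketch of formatting into a caller-owned buffer, which can be reused
// across calls to avoid the allocation made by String:
//
// var buf [RFC4122StrSize]byte
// u := NewV4()
// u.StringBytes(buf[:])
// s := string(buf[:]) // canonical xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx form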
// SetVersion sets the version bits.
func (u *UUID) SetVersion(v byte) {
u[6] = (u[6] & 0x0f) | (v << 4)
}
// SetVariant sets the variant bits.
func (u *UUID) SetVariant(v byte) {
switch v {
case VariantNCS:
u[8] = (u[8]&(0xff>>1) | (0x00 << 7))
case VariantRFC4122:
u[8] = (u[8]&(0xff>>2) | (0x02 << 6))
case VariantMicrosoft:
u[8] = (u[8]&(0xff>>3) | (0x06 << 5))
case VariantFuture:
fallthrough
default:
u[8] = (u[8]&(0xff>>3) | (0x07 << 5))
}
}
// Must is a helper that wraps a call to a function returning (UUID, error)
// and panics if the error is non-nil. It is intended for use in variable
// initializations such as
//
// var packageUUID = uuid.Must(uuid.FromString("123e4567-e89b-12d3-a456-426655440000"))
func Must(u UUID, err error) UUID {
if err != nil {
panic(err)
}
return u
}
// DeterministicV4 overwrites this UUID with one computed deterministically to
// evenly fill the space of possible V4 UUIDs. `n` represents how many UUIDs
// will fill the space and `i` is an index into these `n` (and thus must be in
// the range `[0,n)`). The resulting UUIDs will be unique, evenly-spaced, and
// sorted.
func (u *UUID) DeterministicV4(i, n uint64) {
if i >= n {
panic(errors.Errorf(`i must be in [0,%d) was %d`, n, i))
}
// V4 uuids are generated by simply filling 16 bytes with random data (then
// setting the version and variant), so they're randomly distributed through
// the space of possible values. This also means they're roughly evenly
// distributed. We guarantee these values to be similarly distributed.
//
// So, space the row indexes out to fill the space of the integers
// representable with 8 bytes. Then, because this involves some floats (and
// who knows what kind of crazy rounding things can happen when floats are
// involved), make sure they're unique by sticking the index
// in the lower 8 bytes. Note that we need to use BigEndian encodings to keep
// the uuids sorted in the same order as the ints.
spacing := uint64(float64(i) * float64(math.MaxUint64) / float64(n))
binary.BigEndian.PutUint64(u[0:8], spacing)
binary.BigEndian.PutUint64(u[8:16], i)
u.SetVersion(V4)
u.SetVariant(VariantRFC4122)
}
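// A sketch of filling the space with four evenly spaced, sorted UUIDs:
//
// var us [4]UUID
// for i := uint64(0); i < 4; i++ {
// us[i].DeterministicV4(i, 4)
// }
// // us[0], us[1], ... compare in increasing bytewise order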
// Copyright 2016 The Cockroach Authors.
//
// Use of this software is governed by the CockroachDB Software License
// included in the /LICENSE file.
package uuid
import (
"encoding/binary"
"encoding/hex"
"encoding/json"
"fmt"
"github.com/cockroachdb/cockroach/pkg/util/uint128"
"github.com/cockroachdb/errors"
"github.com/cockroachdb/redact"
)
const (
shortSize = 4
shortStrSize = 8
)
type Short struct {
b [shortSize]byte
}
var _ redact.SafeValue = Short{}
// SafeValue implements the redact.SafeValue interface.
func (s Short) SafeValue() {}
// String returns the 8-character hexadecimal representation of the abbreviated
// UUID.
func (s Short) String() string {
var b [shortStrSize]byte
hex.Encode(b[:], s.b[:])
return string(b[:])
}
// Short returns an abbreviated version of the UUID containing the first four
// bytes.
func (u UUID) Short() Short {
return Short{
b: [shortSize]byte(u[0:shortSize]),
}
}
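// A sketch: the short form is the first four bytes rendered as eight hex digits:
//
// u := Must(FromString("6ba7b810-9dad-11d1-80b4-00c04fd430c8"))
// s := u.Short().String() // "6ba7b810"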
// ShortStringer implements fmt.Stringer to output Short() on String().
type ShortStringer UUID
// String is part of fmt.Stringer.
func (s ShortStringer) String() string {
return UUID(s).Short().String()
}
var _ fmt.Stringer = ShortStringer{}
// Equal returns true iff the receiver equals the argument.
//
// This method exists only to conform to the API expected by gogoproto's
// generated Equal implementations.
func (u UUID) Equal(t UUID) bool {
return u == t
}
// GetBytes returns the UUID as a byte slice. It incurs an allocation if
// the return value escapes.
func (u UUID) GetBytes() []byte {
return u.bytes()
}
// GetBytesMut returns the UUID as a mutable byte slice. Unlike GetBytes,
// it does not necessarily incur an allocation if the return value escapes.
// Instead, the return value escaping will cause the method's receiver (and
// any struct that it is a part of) to escape. Use only if GetBytes is causing
// an allocation and the UUID is already on the heap.
func (u *UUID) GetBytesMut() []byte {
return u.bytesMut()
}
// ToUint128 returns the UUID as a Uint128.
func (u UUID) ToUint128() uint128.Uint128 {
return uint128.FromBytes(u.bytes())
}
// Size returns the marshaled size of u, in bytes.
func (u UUID) Size() int {
return len(u)
}
// MarshalTo marshals u to data.
func (u UUID) MarshalTo(data []byte) (int, error) {
return copy(data, u.GetBytes()), nil
}
// Unmarshal unmarshals data to u.
func (u *UUID) Unmarshal(data []byte) error {
return u.UnmarshalBinary(data)
}
// MarshalJSON returns the JSON encoding of u.
func (u UUID) MarshalJSON() ([]byte, error) {
return json.Marshal(u.String())
}
// UnmarshalJSON unmarshals the JSON encoded data into u.
func (u *UUID) UnmarshalJSON(data []byte) error {
var uuidString string
if err := json.Unmarshal(data, &uuidString); err != nil {
return err
}
uuid, err := FromString(uuidString)
*u = uuid
return err
}
// MakeV4 calls NewV4.
func MakeV4() UUID {
return NewV4()
}
// NewPopulatedUUID returns a populated UUID.
func NewPopulatedUUID(r interface {
Uint32() uint32
}) *UUID {
var u UUID
binary.LittleEndian.PutUint32(u[:4], r.Uint32())
binary.LittleEndian.PutUint32(u[4:8], r.Uint32())
binary.LittleEndian.PutUint32(u[8:12], r.Uint32())
binary.LittleEndian.PutUint32(u[12:], r.Uint32())
return &u
}
// FromUint128 delegates to FromBytes and wraps the result in a UUID.
func FromUint128(input uint128.Uint128) UUID {
u, err := FromBytes(input.GetBytes())
if err != nil {
panic(errors.Wrap(err, "should never happen with 16 byte slice"))
}
return u
}