// config: Go Coverage Report

// Copyright 2015 The Prometheus Authors
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package config

import (
        "errors"
        "fmt"
        "net/url"
        "os"
        "path/filepath"
        "sort"
        "strconv"
        "strings"
        "time"

        "github.com/alecthomas/units"
        "github.com/go-kit/log"
        "github.com/go-kit/log/level"
        "github.com/grafana/regexp"
        "github.com/prometheus/common/config"
        "github.com/prometheus/common/model"
        "github.com/prometheus/common/sigv4"
        "gopkg.in/yaml.v2"

        "github.com/prometheus/prometheus/discovery"
        "github.com/prometheus/prometheus/model/labels"
        "github.com/prometheus/prometheus/model/relabel"
        "github.com/prometheus/prometheus/storage/remote/azuread"
)

var (
        // patRulePath accepts a path only if no '/' follows its first '*', i.e.
        // wildcards may appear in the last path component only.
        // Config.UnmarshalYAML applies it to the rule_files and
        // scrape_config_files entries.
        patRulePath     = regexp.MustCompile(`^[^*]*(\*[^/]*)?$`)
        // reservedHeaders is a set of lower-case HTTP header names.
        // NOTE(review): presumably these are the headers users may not set as
        // custom headers on remote read/write requests; the code consulting this
        // set is not visible here, so confirm at the use site.
        reservedHeaders = map[string]struct{}{
                // NOTE: authorization is checked specially,
                // see RemoteWriteConfig.UnmarshalYAML.
                // "authorization":                  {},
                "host":                              {},
                "content-encoding":                  {},
                "content-length":                    {},
                "content-type":                      {},
                "user-agent":                        {},
                "connection":                        {},
                "keep-alive":                        {},
                "proxy-authenticate":                {},
                "proxy-authorization":               {},
                "www-authenticate":                  {},
                "accept-encoding":                   {},
                "x-prometheus-remote-write-version": {},
                "x-prometheus-remote-read-version":  {},

                // Added by SigV4.
                "x-amz-date":           {},
                "x-amz-security-token": {},
                "x-amz-content-sha256": {},
        }
)

// Load parses the YAML document s into a Config.
//
// Unknown YAML keys are rejected because decoding is strict. When
// expandExternalLabels is true, every external label value is passed through
// os.Expand: variable references are replaced with the value of the
// environment variable of that name, and a reference named "$" (written "$$")
// yields a literal "$". Unset or empty variables expand to the empty string
// and are reported through logger at warn level.
func Load(s string, expandExternalLabels bool, logger log.Logger) (*Config, error) {
        // An entirely empty document never reaches Config.UnmarshalYAML, so the
        // defaults have to be applied here as well.
        cfg := new(Config)
        *cfg = DefaultConfig

        if err := yaml.UnmarshalStrict([]byte(s), cfg); err != nil {
                return nil, err
        }
        if !expandExternalLabels {
                return cfg, nil
        }

        // lookupEnv resolves a single variable reference found by os.Expand.
        lookupEnv := func(name string) string {
                if name == "$" {
                        return "$"
                }
                value := os.Getenv(name)
                if value == "" {
                        level.Warn(logger).Log("msg", "Empty environment variable", "name", name)
                }
                return value
        }

        expanded := labels.NewScratchBuilder(0)
        cfg.GlobalConfig.ExternalLabels.Range(func(lbl labels.Label) {
                value := os.Expand(lbl.Value, lookupEnv)
                if value != lbl.Value {
                        level.Debug(logger).Log("msg", "External label replaced", "label", lbl.Name, "input", lbl.Value, "output", value)
                }
                // The expanded value may be blank; the label is kept regardless.
                // See https://github.com/prometheus/prometheus/issues/11024
                expanded.Add(lbl.Name, value)
        })
        cfg.GlobalConfig.ExternalLabels = expanded.Labels()
        return cfg, nil
}

// LoadFile reads filename and parses its YAML content into a Config.
//
// With agentMode set, a configuration that contains alerting, rule_files or
// remote_read settings is rejected. On success, relative paths inside the
// configuration are joined with the directory that contains filename.
func LoadFile(filename string, agentMode, expandExternalLabels bool, logger log.Logger) (*Config, error) {
        raw, err := os.ReadFile(filename)
        if err != nil {
                return nil, err
        }

        cfg, err := Load(string(raw), expandExternalLabels, logger)
        if err != nil {
                return nil, fmt.Errorf("parsing YAML file %s: %w", filename, err)
        }

        if agentMode {
                // The cases are evaluated in order, so the first offending
                // section determines the reported error.
                switch {
                case len(cfg.AlertingConfig.AlertmanagerConfigs) > 0 || len(cfg.AlertingConfig.AlertRelabelConfigs) > 0:
                        return nil, errors.New("field alerting is not allowed in agent mode")
                case len(cfg.RuleFiles) > 0:
                        return nil, errors.New("field rule_files is not allowed in agent mode")
                case len(cfg.RemoteReadConfigs) > 0:
                        return nil, errors.New("field remote_read is not allowed in agent mode")
                }
        }

        cfg.SetDirectory(filepath.Dir(filename))
        return cfg, nil
}

// The defaults applied before parsing the respective config sections.
var (
        // DefaultConfig is the default top-level configuration.
        DefaultConfig = Config{
                GlobalConfig: DefaultGlobalConfig,
        }

        // DefaultGlobalConfig is the default global configuration.
        DefaultGlobalConfig = GlobalConfig{
                ScrapeInterval:     model.Duration(1 * time.Minute),
                ScrapeTimeout:      model.Duration(10 * time.Second),
                EvaluationInterval: model.Duration(1 * time.Minute),
                RuleQueryOffset:    model.Duration(0 * time.Minute),
                // When native histogram feature flag is enabled, ScrapeProtocols default
                // changes to DefaultProtoFirstScrapeProtocols.
                ScrapeProtocols: DefaultScrapeProtocols,
        }

        // DefaultRuntimeConfig is the default runtime configuration.
        DefaultRuntimeConfig = RuntimeConfig{
                // Go runtime tuning.
                GoGC: 75,
        }

        // DefaultScrapeConfig is the default scrape configuration.
        DefaultScrapeConfig = ScrapeConfig{
                // ScrapeTimeout, ScrapeInterval and ScrapeProtocols default to the configured globals.
                ScrapeClassicHistograms: false,
                MetricsPath:             "/metrics",
                Scheme:                  "http",
                HonorLabels:             false,
                HonorTimestamps:         true,
                HTTPClientConfig:        config.DefaultHTTPClientConfig,
                EnableCompression:       true,
        }

        // DefaultAlertmanagerConfig is the default alertmanager configuration.
        DefaultAlertmanagerConfig = AlertmanagerConfig{
                Scheme:           "http",
                Timeout:          model.Duration(10 * time.Second),
                APIVersion:       AlertmanagerAPIVersionV2,
                HTTPClientConfig: config.DefaultHTTPClientConfig,
        }

        // DefaultRemoteWriteConfig is the default remote write configuration.
        DefaultRemoteWriteConfig = RemoteWriteConfig{
                RemoteTimeout:    model.Duration(30 * time.Second),
                QueueConfig:      DefaultQueueConfig,
                MetadataConfig:   DefaultMetadataConfig,
                HTTPClientConfig: config.DefaultHTTPClientConfig,
        }

        // DefaultQueueConfig is the default remote queue configuration.
        DefaultQueueConfig = QueueConfig{
                // With a maximum of 50 shards, assuming an average of 100ms remote write
                // time and 2000 samples per batch, we will be able to push 1M samples/s.
                MaxShards:         50,
                MinShards:         1,
                MaxSamplesPerSend: 2000,

                // Each shard will have a max of 10,000 samples pending in its channel, plus the pending
                // samples that have been enqueued. Theoretically we should only ever have about 12,000 samples
                // per shard pending. At 50 shards that's 600k.
                Capacity:          10000,
                BatchSendDeadline: model.Duration(5 * time.Second),

                // Backoff times for retrying a batch of samples on recoverable errors.
                MinBackoff: model.Duration(30 * time.Millisecond),
                MaxBackoff: model.Duration(5 * time.Second),
        }

        // DefaultMetadataConfig is the default metadata configuration for a remote write endpoint.
        DefaultMetadataConfig = MetadataConfig{
                Send:              true,
                SendInterval:      model.Duration(1 * time.Minute),
                MaxSamplesPerSend: 2000,
        }

        // DefaultRemoteReadConfig is the default remote read configuration.
        DefaultRemoteReadConfig = RemoteReadConfig{
                RemoteTimeout:        model.Duration(1 * time.Minute),
                HTTPClientConfig:     config.DefaultHTTPClientConfig,
                FilterExternalLabels: true,
        }

        // DefaultStorageConfig is the default TSDB/Exemplar storage configuration.
        // It points at DefaultExemplarsConfig rather than holding a copy of it.
        DefaultStorageConfig = StorageConfig{
                ExemplarsConfig: &DefaultExemplarsConfig,
        }

        // DefaultExemplarsConfig is the default exemplar storage configuration.
        DefaultExemplarsConfig = ExemplarsConfig{
                MaxExemplars: 100000,
        }
)

// Config is the top-level configuration for Prometheus's config files.
// Each field corresponds to the top-level YAML key named in its struct tag.
type Config struct {
        // Settings shared across the other sections ("global").
        GlobalConfig      GlobalConfig    `yaml:"global"`
        // Process-level settings ("runtime").
        Runtime           RuntimeConfig   `yaml:"runtime,omitempty"`
        // Alerting settings ("alerting").
        AlertingConfig    AlertingConfig  `yaml:"alerting,omitempty"`
        // Rule file paths or patterns ("rule_files"); validated against
        // patRulePath in UnmarshalYAML.
        RuleFiles         []string        `yaml:"rule_files,omitempty"`
        // Paths or patterns of additional files that contain scrape configs
        // ("scrape_config_files"); expanded by GetScrapeConfigs.
        ScrapeConfigFiles []string        `yaml:"scrape_config_files,omitempty"`
        // Scrape configs defined inline in the main file ("scrape_configs").
        ScrapeConfigs     []*ScrapeConfig `yaml:"scrape_configs,omitempty"`
        // Storage settings ("storage").
        StorageConfig     StorageConfig   `yaml:"storage,omitempty"`
        // Tracing settings ("tracing").
        TracingConfig     TracingConfig   `yaml:"tracing,omitempty"`

        // Remote write endpoints ("remote_write").
        RemoteWriteConfigs []*RemoteWriteConfig `yaml:"remote_write,omitempty"`
        // Remote read endpoints ("remote_read").
        RemoteReadConfigs  []*RemoteReadConfig  `yaml:"remote_read,omitempty"`
}

// SetDirectory joins any relative file paths in the configuration with dir.
// It forwards dir to the global, alerting and tracing sections and to every
// scrape, remote write and remote read config, and rewrites the rule file and
// scrape config file entries in place.
func (c *Config) SetDirectory(dir string) {
        c.GlobalConfig.SetDirectory(dir)
        c.AlertingConfig.SetDirectory(dir)
        c.TracingConfig.SetDirectory(dir)

        for i := range c.RuleFiles {
                c.RuleFiles[i] = config.JoinDir(dir, c.RuleFiles[i])
        }
        for i := range c.ScrapeConfigFiles {
                c.ScrapeConfigFiles[i] = config.JoinDir(dir, c.ScrapeConfigFiles[i])
        }

        for _, scrapeCfg := range c.ScrapeConfigs {
                scrapeCfg.SetDirectory(dir)
        }
        for _, writeCfg := range c.RemoteWriteConfigs {
                writeCfg.SetDirectory(dir)
        }
        for _, readCfg := range c.RemoteReadConfigs {
                readCfg.SetDirectory(dir)
        }
}

// String returns the configuration marshaled as YAML. If marshaling fails,
// a placeholder string describing the error is returned instead.
func (c Config) String() string {
        out, err := yaml.Marshal(c)
        if err == nil {
                return string(out)
        }
        return fmt.Sprintf("<error creating config string: %s>", err)
}

// GetScrapeConfigs returns the scrape configurations of the main config
// followed by those found in the files matched by ScrapeConfigFiles.
//
// Every scrape config is validated against the global config, which also
// fills in values inherited from it. Job names must be unique across all
// sources. Scrape configs loaded from a file have their relative paths joined
// with that file's directory. The first error encountered is returned and no
// partial result is produced.
func (c *Config) GetScrapeConfigs() ([]*ScrapeConfig, error) {
        all := make([]*ScrapeConfig, 0, len(c.ScrapeConfigs))
        // origin records, per job name, where that job was first defined.
        origin := make(map[string]string, len(c.ScrapeConfigs))

        for _, sc := range c.ScrapeConfigs {
                // Library users may build a Config without going through
                // Unmarshal, so validation is repeated here.
                if err := sc.Validate(c.GlobalConfig); err != nil {
                        return nil, err
                }
                if _, dup := origin[sc.JobName]; dup {
                        return nil, fmt.Errorf("found multiple scrape configs with job name %q", sc.JobName)
                }
                origin[sc.JobName] = "main config file"
                all = append(all, sc)
        }

        for _, pattern := range c.ScrapeConfigFiles {
                matches, err := filepath.Glob(pattern)
                if err != nil {
                        // Glob only fails on a malformed pattern.
                        return nil, fmt.Errorf("error retrieving scrape config files for %q: %w", pattern, err)
                }

                for _, file := range matches {
                        data, err := os.ReadFile(file)
                        if err != nil {
                                return nil, fileErr(file, err)
                        }

                        var parsed ScrapeConfigs
                        if err := yaml.UnmarshalStrict(data, &parsed); err != nil {
                                return nil, fileErr(file, err)
                        }

                        for _, sc := range parsed.ScrapeConfigs {
                                if err := sc.Validate(c.GlobalConfig); err != nil {
                                        return nil, fileErr(file, err)
                                }
                                if first, dup := origin[sc.JobName]; dup {
                                        return nil, fileErr(file, fmt.Errorf("found multiple scrape configs with job name %q, first found in %s", sc.JobName, first))
                                }
                                origin[sc.JobName] = fmt.Sprintf("%q", filePath(file))

                                sc.SetDirectory(filepath.Dir(file))
                                all = append(all, sc)
                        }
                }
        }
        return all, nil
}

// UnmarshalYAML implements the yaml.Unmarshaler interface.
//
// It resets c to DefaultConfig, decodes the input on top of it, restores the
// defaults of the global and runtime sections when they decoded to their zero
// value, and then validates rule/scrape-config file patterns, scrape configs
// and the uniqueness of job, remote write and remote read names.
func (c *Config) UnmarshalYAML(unmarshal func(interface{}) error) error {
        // plain has the fields of Config but none of its methods, so decoding
        // into it does not call this method again.
        type plain Config

        // Start from the defaults and let the input overwrite them.
        *c = DefaultConfig
        if err := unmarshal((*plain)(c)); err != nil {
                return err
        }

        // A global block that is present but empty overwrites the defaults;
        // restore them.
        if c.GlobalConfig.isZero() {
                c.GlobalConfig = DefaultGlobalConfig
        }

        // Same for the runtime block, except that GoGC then comes from the
        // GOGC environment variable helper.
        if c.Runtime.isZero() {
                runtime := DefaultRuntimeConfig
                runtime.GoGC = getGoGCEnv()
                c.Runtime = runtime
        }

        for _, pattern := range c.RuleFiles {
                if patRulePath.MatchString(pattern) {
                        continue
                }
                return fmt.Errorf("invalid rule file path %q", pattern)
        }
        for _, pattern := range c.ScrapeConfigFiles {
                if patRulePath.MatchString(pattern) {
                        continue
                }
                return fmt.Errorf("invalid scrape config file path %q", pattern)
        }

        // Validate applies the global overrides; job names must be unique.
        seenJobs := make(map[string]struct{}, len(c.ScrapeConfigs))
        for _, sc := range c.ScrapeConfigs {
                if err := sc.Validate(c.GlobalConfig); err != nil {
                        return err
                }
                if _, dup := seenJobs[sc.JobName]; dup {
                        return fmt.Errorf("found multiple scrape configs with job name %q", sc.JobName)
                }
                seenJobs[sc.JobName] = struct{}{}
        }

        seenWrites := make(map[string]struct{}, len(c.RemoteWriteConfigs))
        for _, rw := range c.RemoteWriteConfigs {
                if rw == nil {
                        return errors.New("empty or null remote write config section")
                }
                // Empty names are exempt: the remote write code names them after
                // their config hash.
                _, dup := seenWrites[rw.Name]
                if dup && rw.Name != "" {
                        return fmt.Errorf("found multiple remote write configs with job name %q", rw.Name)
                }
                seenWrites[rw.Name] = struct{}{}
        }

        seenReads := make(map[string]struct{}, len(c.RemoteReadConfigs))
        for _, rr := range c.RemoteReadConfigs {
                if rr == nil {
                        return errors.New("empty or null remote read config section")
                }
                // Empty names are exempt: the remote read code names them after
                // their config hash.
                _, dup := seenReads[rr.Name]
                if dup && rr.Name != "" {
                        return fmt.Errorf("found multiple remote read configs with job name %q", rr.Name)
                }
                seenReads[rr.Name] = struct{}{}
        }
        return nil
}

// GlobalConfig configures values that are used across other configuration
// objects.
type GlobalConfig struct {
        // How frequently to scrape targets by default.
        ScrapeInterval model.Duration `yaml:"scrape_interval,omitempty"`
        // The default timeout when scraping targets.
        ScrapeTimeout model.Duration `yaml:"scrape_timeout,omitempty"`
        // The protocols to negotiate during a scrape. It tells clients which
        // protocols are accepted by Prometheus and with what weight (most wanted is first).
        // Supported values (case sensitive): PrometheusProto, OpenMetricsText0.0.1,
        // OpenMetricsText1.0.0, PrometheusText0.0.4.
        ScrapeProtocols []ScrapeProtocol `yaml:"scrape_protocols,omitempty"`
        // How frequently to evaluate rules by default.
        EvaluationInterval model.Duration `yaml:"evaluation_interval,omitempty"`
        // Offset the rule evaluation timestamp by the specified duration into the
        // past to ensure the underlying metrics have been received.
        RuleQueryOffset model.Duration `yaml:"rule_query_offset,omitempty"`
        // File to which PromQL queries are logged.
        QueryLogFile string `yaml:"query_log_file,omitempty"`
        // The labels to add to any timeseries that this Prometheus instance scrapes.
        ExternalLabels labels.Labels `yaml:"external_labels,omitempty"`
        // An uncompressed response body larger than this many bytes will cause the
        // scrape to fail. 0 means no limit.
        BodySizeLimit units.Base2Bytes `yaml:"body_size_limit,omitempty"`
        // More than this many samples post metric-relabeling will cause the scrape to
        // fail. 0 means no limit.
        SampleLimit uint `yaml:"sample_limit,omitempty"`
        // More than this many targets after the target relabeling will cause the
        // scrapes to fail. 0 means no limit.
        TargetLimit uint `yaml:"target_limit,omitempty"`
        // More than this many labels post metric-relabeling will cause the scrape to
        // fail. 0 means no limit.
        LabelLimit uint `yaml:"label_limit,omitempty"`
        // A label name longer than this post metric-relabeling will cause the
        // scrape to fail. 0 means no limit.
        LabelNameLengthLimit uint `yaml:"label_name_length_limit,omitempty"`
        // A label value longer than this post metric-relabeling will cause the
        // scrape to fail. 0 means no limit.
        LabelValueLengthLimit uint `yaml:"label_value_length_limit,omitempty"`
        // Keep no more than this many dropped targets per job.
        // 0 means no limit.
        KeepDroppedTargets uint `yaml:"keep_dropped_targets,omitempty"`
}

// ScrapeProtocol names an exposition format that can be negotiated when
// scraping metrics.
type ScrapeProtocol string

// Validate returns an error if s is not a key of ScrapeProtocolsHeaders.
// The error lists the supported protocol names in sorted order.
func (s ScrapeProtocol) Validate() error {
        if _, known := ScrapeProtocolsHeaders[s]; known {
                return nil
        }

        supported := make([]string, 0, len(ScrapeProtocolsHeaders))
        for proto := range ScrapeProtocolsHeaders {
                supported = append(supported, string(proto))
        }
        // Map iteration order is random; sort for a stable message.
        sort.Strings(supported)
        return fmt.Errorf("unknown scrape protocol %v, supported: %v", s, supported)
}

// Supported scrape protocols, their media types and the default preference
// orders.
var (
        // The protocol names accepted in scrape_protocols (case sensitive).
        PrometheusProto      ScrapeProtocol = "PrometheusProto"
        PrometheusText0_0_4  ScrapeProtocol = "PrometheusText0.0.4"
        OpenMetricsText0_0_1 ScrapeProtocol = "OpenMetricsText0.0.1"
        OpenMetricsText1_0_0 ScrapeProtocol = "OpenMetricsText1.0.0"

        // ScrapeProtocolsHeaders maps every supported protocol to its media type
        // string. ScrapeProtocol.Validate treats the key set as the list of
        // supported protocols.
        // NOTE(review): presumably the values are used to build the Accept header
        // of scrape requests; that code is not visible here.
        ScrapeProtocolsHeaders = map[ScrapeProtocol]string{
                PrometheusProto:      "application/vnd.google.protobuf;proto=io.prometheus.client.MetricFamily;encoding=delimited",
                PrometheusText0_0_4:  "text/plain;version=0.0.4",
                OpenMetricsText0_0_1: "application/openmetrics-text;version=0.0.1",
                OpenMetricsText1_0_0: "application/openmetrics-text;version=1.0.0",
        }

        // DefaultScrapeProtocols is the set of scrape protocols that will be proposed
        // to scrape target, ordered by priority.
        DefaultScrapeProtocols = []ScrapeProtocol{
                OpenMetricsText1_0_0,
                OpenMetricsText0_0_1,
                PrometheusText0_0_4,
        }

        // DefaultProtoFirstScrapeProtocols is like DefaultScrapeProtocols, but it
        // favors protobuf Prometheus exposition format.
        // Used by default for certain feature-flags like
        // "native-histograms" and "created-timestamp-zero-ingestion".
        DefaultProtoFirstScrapeProtocols = []ScrapeProtocol{
                PrometheusProto,
                OpenMetricsText1_0_0,
                OpenMetricsText0_0_1,
                PrometheusText0_0_4,
        }
)

// validateAcceptScrapeProtocols returns an error if sps is empty, contains
// the same protocol twice (compared case-insensitively), or contains a
// protocol that fails ScrapeProtocol.Validate.
func validateAcceptScrapeProtocols(sps []ScrapeProtocol) error {
        if len(sps) == 0 {
                return errors.New("scrape_protocols cannot be empty")
        }

        seen := make(map[string]struct{}, len(sps))
        for _, proto := range sps {
                key := strings.ToLower(string(proto))
                // The duplicate check runs before validation, so a repeated entry
                // is reported as a duplicate even if it is also unknown.
                if _, dup := seen[key]; dup {
                        return fmt.Errorf("duplicated protocol in scrape_protocols, got %v", sps)
                }
                if err := proto.Validate(); err != nil {
                        return fmt.Errorf("scrape_protocols: %w", err)
                }
                seen[key] = struct{}{}
        }
        return nil
}

// SetDirectory joins the query log file path with dir via config.JoinDir,
// so that a relative path is resolved against dir.
func (c *GlobalConfig) SetDirectory(dir string) {
        joined := config.JoinDir(dir, c.QueryLogFile)
        c.QueryLogFile = joined
}

// UnmarshalYAML implements the yaml.Unmarshaler interface.
//
// The input is decoded into a fresh value, validated, completed with defaults
// from DefaultGlobalConfig, and only then copied over c. On error c is left
// untouched.
func (c *GlobalConfig) UnmarshalYAML(unmarshal func(interface{}) error) error {
        // plain has no methods, so decoding into it does not re-enter this method.
        type plain GlobalConfig

        // Decode into a clean value: c itself has already been populated with
        // the defaults because of how the YAML parser treats empty blocks.
        var parsed GlobalConfig
        if err := unmarshal((*plain)(&parsed)); err != nil {
                return err
        }

        checkLabel := func(l labels.Label) error {
                if !model.LabelName(l.Name).IsValid() {
                        return fmt.Errorf("%q is not a valid label name", l.Name)
                }
                if !model.LabelValue(l.Value).IsValid() {
                        return fmt.Errorf("%q is not a valid label value", l.Value)
                }
                return nil
        }
        if err := parsed.ExternalLabels.Validate(checkLabel); err != nil {
                return err
        }

        // Settle the scrape interval first; the timeout, explicit or inferred,
        // must not exceed it.
        if parsed.ScrapeInterval == 0 {
                parsed.ScrapeInterval = DefaultGlobalConfig.ScrapeInterval
        }
        if parsed.ScrapeTimeout > parsed.ScrapeInterval {
                return errors.New("global scrape timeout greater than scrape interval")
        }
        if parsed.ScrapeTimeout == 0 {
                // Use the default timeout, capped at the scrape interval.
                timeout := DefaultGlobalConfig.ScrapeTimeout
                if timeout > parsed.ScrapeInterval {
                        timeout = parsed.ScrapeInterval
                }
                parsed.ScrapeTimeout = timeout
        }

        if parsed.EvaluationInterval == 0 {
                parsed.EvaluationInterval = DefaultGlobalConfig.EvaluationInterval
        }

        if parsed.ScrapeProtocols == nil {
                parsed.ScrapeProtocols = DefaultGlobalConfig.ScrapeProtocols
        }
        if err := validateAcceptScrapeProtocols(parsed.ScrapeProtocols); err != nil {
                return fmt.Errorf("%w for global config", err)
        }

        *c = parsed
        return nil
}

// isZero reports whether the global config looks unset: no external labels,
// zero scrape interval, scrape timeout, evaluation interval and rule query
// offset, an empty query log file and nil scrape protocols. The limit fields
// are not part of the check.
func (c *GlobalConfig) isZero() bool {
        switch {
        case !c.ExternalLabels.IsEmpty(),
                c.ScrapeInterval != 0,
                c.ScrapeTimeout != 0,
                c.EvaluationInterval != 0,
                c.RuleQueryOffset != 0,
                c.QueryLogFile != "",
                c.ScrapeProtocols != nil:
                return false
        default:
                return true
        }
}

// RuntimeConfig holds settings that affect the behavior of the process itself.
type RuntimeConfig struct {
        // The Go garbage collection target percentage.
        GoGC int `yaml:"gogc,omitempty"`
}

// isZero reports whether the runtime config is the zero value.
func (c *RuntimeConfig) isZero() bool {
        return *c == (RuntimeConfig{})
}

// ScrapeConfigs is the document shape of a standalone scrape config file: a
// single scrape_configs list. Config.GetScrapeConfigs decodes each file
// matched by scrape_config_files into this type.
type ScrapeConfigs struct {
        ScrapeConfigs []*ScrapeConfig `yaml:"scrape_configs,omitempty"`
}

// ScrapeConfig configures a scraping unit for Prometheus.
// Fields left at their zero value for the interval, timeout, limits and
// scrape protocols are filled from the global config by Validate.
type ScrapeConfig struct {
        // The job name to which the job label is set by default.
        JobName string `yaml:"job_name"`
        // Indicator whether the scraped metrics should remain unmodified.
        HonorLabels bool `yaml:"honor_labels,omitempty"`
        // Indicator whether the scraped timestamps should be respected.
        HonorTimestamps bool `yaml:"honor_timestamps"`
        // Indicator whether to track the staleness of the scraped timestamps.
        TrackTimestampsStaleness bool `yaml:"track_timestamps_staleness"`
        // A set of query parameters with which the target is scraped.
        Params url.Values `yaml:"params,omitempty"`
        // How frequently to scrape the targets of this scrape config.
        ScrapeInterval model.Duration `yaml:"scrape_interval,omitempty"`
        // The timeout for scraping targets of this config.
        ScrapeTimeout model.Duration `yaml:"scrape_timeout,omitempty"`
        // The protocols to negotiate during a scrape. It tells clients which
        // protocols are accepted by Prometheus and with what preference (most wanted is first).
        // Supported values (case sensitive): PrometheusProto, OpenMetricsText0.0.1,
        // OpenMetricsText1.0.0, PrometheusText0.0.4.
        ScrapeProtocols []ScrapeProtocol `yaml:"scrape_protocols,omitempty"`
        // Whether to scrape a classic histogram that is also exposed as a native histogram.
        ScrapeClassicHistograms bool `yaml:"scrape_classic_histograms,omitempty"`
        // The HTTP resource path on which to fetch metrics from targets.
        MetricsPath string `yaml:"metrics_path,omitempty"`
        // The URL scheme with which to fetch metrics from targets.
        Scheme string `yaml:"scheme,omitempty"`
        // Indicator whether to request compressed response from the target.
        EnableCompression bool `yaml:"enable_compression"`
        // An uncompressed response body larger than this many bytes will cause the
        // scrape to fail. 0 means no limit.
        BodySizeLimit units.Base2Bytes `yaml:"body_size_limit,omitempty"`
        // More than this many samples post metric-relabeling will cause the scrape to
        // fail. 0 means no limit.
        SampleLimit uint `yaml:"sample_limit,omitempty"`
        // More than this many targets after the target relabeling will cause the
        // scrapes to fail. 0 means no limit.
        TargetLimit uint `yaml:"target_limit,omitempty"`
        // More than this many labels post metric-relabeling will cause the scrape to
        // fail. 0 means no limit.
        LabelLimit uint `yaml:"label_limit,omitempty"`
        // A label name longer than this post metric-relabeling will cause the
        // scrape to fail. 0 means no limit.
        LabelNameLengthLimit uint `yaml:"label_name_length_limit,omitempty"`
        // A label value longer than this post metric-relabeling will cause the
        // scrape to fail. 0 means no limit.
        LabelValueLengthLimit uint `yaml:"label_value_length_limit,omitempty"`
        // If there are more than this many buckets in a native histogram,
        // buckets will be merged to stay within the limit.
        NativeHistogramBucketLimit uint `yaml:"native_histogram_bucket_limit,omitempty"`
        // If the growth factor of one bucket to the next is smaller than this,
        // buckets will be merged to increase the factor sufficiently.
        NativeHistogramMinBucketFactor float64 `yaml:"native_histogram_min_bucket_factor,omitempty"`
        // Keep no more than this many dropped targets per job.
        // 0 means no limit.
        KeepDroppedTargets uint `yaml:"keep_dropped_targets,omitempty"`

        // We cannot do proper Go type embedding below as the parser will then parse
        // values arbitrarily into the overflow maps of further-down types.

        // Service discovery configurations. The "-" tag excludes this field from
        // regular YAML decoding; UnmarshalYAML hands the struct to
        // discovery.UnmarshalYAMLWithInlineConfigs instead.
        ServiceDiscoveryConfigs discovery.Configs       `yaml:"-"`
        // HTTP client settings, decoded inline from the scrape config's own keys.
        HTTPClientConfig        config.HTTPClientConfig `yaml:",inline"`

        // List of target relabel configurations.
        RelabelConfigs []*relabel.Config `yaml:"relabel_configs,omitempty"`
        // List of metric relabel configurations.
        MetricRelabelConfigs []*relabel.Config `yaml:"metric_relabel_configs,omitempty"`
}

// SetDirectory joins any relative file paths with dir by forwarding dir to
// the service discovery configs and to the HTTP client config.
func (c *ScrapeConfig) SetDirectory(dir string) {
        // Service discovery first, then the HTTP client settings.
        c.ServiceDiscoveryConfigs.SetDirectory(dir)
        c.HTTPClientConfig.SetDirectory(dir)
}

// UnmarshalYAML implements the yaml.Unmarshaler interface.
//
// It resets c to DefaultScrapeConfig, decodes the input together with its
// inline service discovery configs, and rejects a config that has an empty
// job name, an invalid HTTP client config, or a null relabeling rule.
func (c *ScrapeConfig) UnmarshalYAML(unmarshal func(interface{}) error) error {
        *c = DefaultScrapeConfig
        if err := discovery.UnmarshalYAMLWithInlineConfigs(c, unmarshal); err != nil {
                return err
        }

        if c.JobName == "" {
                return errors.New("job_name is empty")
        }

        // HTTPClientConfig is inlined by value, so its own UnmarshalYAML is never
        // invoked, and the parser panics on inlined pointer structs. Run its
        // validation explicitly instead.
        if err := c.HTTPClientConfig.Validate(); err != nil {
                return err
        }

        // Without relabeling rules, check for users putting URLs in target groups.
        if len(c.RelabelConfigs) == 0 {
                if err := checkStaticTargets(c.ServiceDiscoveryConfigs); err != nil {
                        return err
                }
        }

        for i := range c.RelabelConfigs {
                if c.RelabelConfigs[i] == nil {
                        return errors.New("empty or null target relabeling rule in scrape config")
                }
        }
        for i := range c.MetricRelabelConfigs {
                if c.MetricRelabelConfigs[i] == nil {
                        return errors.New("empty or null metric relabeling rule in scrape config")
                }
        }
        return nil
}

// Validate checks the scrape config and fills unset fields from globalConfig.
//
// A zero scrape interval, scrape timeout, body size limit or other limit
// inherits the global value, and nil scrape protocols inherit the global
// list. An inherited timeout is capped at the scrape interval. An error is
// returned for a nil receiver, for a timeout greater than the interval, and
// for an invalid scrape protocol list.
func (c *ScrapeConfig) Validate(globalConfig GlobalConfig) error {
        if c == nil {
                return errors.New("empty or null scrape config section")
        }

        // Settle the scrape interval first; the timeout, explicit or inferred,
        // must not exceed it.
        if c.ScrapeInterval == 0 {
                c.ScrapeInterval = globalConfig.ScrapeInterval
        }
        switch {
        case c.ScrapeTimeout > c.ScrapeInterval:
                return fmt.Errorf("scrape timeout greater than scrape interval for scrape config with job name %q", c.JobName)
        case c.ScrapeTimeout == 0:
                // Use the global timeout, capped at this config's interval.
                c.ScrapeTimeout = globalConfig.ScrapeTimeout
                if c.ScrapeTimeout > c.ScrapeInterval {
                        c.ScrapeTimeout = c.ScrapeInterval
                }
        }

        if c.BodySizeLimit == 0 {
                c.BodySizeLimit = globalConfig.BodySizeLimit
        }

        // inherit copies the global value into a limit that is still zero.
        inherit := func(limit *uint, global uint) {
                if *limit == 0 {
                        *limit = global
                }
        }
        inherit(&c.SampleLimit, globalConfig.SampleLimit)
        inherit(&c.TargetLimit, globalConfig.TargetLimit)
        inherit(&c.LabelLimit, globalConfig.LabelLimit)
        inherit(&c.LabelNameLengthLimit, globalConfig.LabelNameLengthLimit)
        inherit(&c.LabelValueLengthLimit, globalConfig.LabelValueLengthLimit)
        inherit(&c.KeepDroppedTargets, globalConfig.KeepDroppedTargets)

        if c.ScrapeProtocols == nil {
                c.ScrapeProtocols = globalConfig.ScrapeProtocols
        }
        if err := validateAcceptScrapeProtocols(c.ScrapeProtocols); err != nil {
                return fmt.Errorf("%w for scrape config with job name %q", err, c.JobName)
        }
        return nil
}

// MarshalYAML implements the yaml.Marshaler interface.
// Marshaling is delegated to discovery.MarshalYAMLWithInlineConfigs.
func (c *ScrapeConfig) MarshalYAML() (interface{}, error) {
        return discovery.MarshalYAMLWithInlineConfigs(c)
}

// StorageConfig configures runtime reloadable configuration options.
type StorageConfig struct {
        // TSDBConfig holds the options found under the `tsdb` key, if any.
        TSDBConfig      *TSDBConfig      `yaml:"tsdb,omitempty"`
        // ExemplarsConfig holds the options found under the `exemplars` key, if any.
        ExemplarsConfig *ExemplarsConfig `yaml:"exemplars,omitempty"`
}

// TSDBConfig configures runtime reloadable configuration options.
type TSDBConfig struct {
        // OutOfOrderTimeWindow sets how long back in time an out-of-order sample can be inserted
        // into the TSDB. This flag is typically set while unmarshaling the configuration file and translating
        // OutOfOrderTimeWindowFlag's duration. The unit of this flag is expected to be the same as any
        // other timestamp in the TSDB.
        OutOfOrderTimeWindow int64

        // OutOfOrderTimeWindowFlag holds the parsed duration from the config file.
        // During unmarshaling, this is converted into milliseconds and stored in OutOfOrderTimeWindow.
        // This should not be used directly and must be converted into OutOfOrderTimeWindow.
        OutOfOrderTimeWindowFlag model.Duration `yaml:"out_of_order_time_window,omitempty"`
}

// UnmarshalYAML implements the yaml.Unmarshaler interface.
//
// The receiver is reset to its zero value before decoding. After a
// successful decode, OutOfOrderTimeWindow is derived from
// OutOfOrderTimeWindowFlag, expressed in milliseconds.
func (t *TSDBConfig) UnmarshalYAML(unmarshal func(interface{}) error) error {
        *t = TSDBConfig{}

        // Decode through a method-less alias so that this method is not
        // invoked recursively.
        type plain TSDBConfig
        err := unmarshal((*plain)(t))
        if err != nil {
                return err
        }

        window := time.Duration(t.OutOfOrderTimeWindowFlag)
        t.OutOfOrderTimeWindow = window.Milliseconds()
        return nil
}

// TracingClientType names the kind of client used for tracing.
// UnmarshalYAML only accepts TracingClientHTTP and TracingClientGRPC.
type TracingClientType string

const (
        // TracingClientHTTP selects the "http" tracing client.
        TracingClientHTTP TracingClientType = "http"
        // TracingClientGRPC selects the "grpc" tracing client.
        TracingClientGRPC TracingClientType = "grpc"

        // GzipCompression is the name of the gzip compression option.
        GzipCompression = "gzip"
)

// UnmarshalYAML implements the yaml.Unmarshaler interface.
//
// The receiver is reset to the empty string before decoding. Any decoded
// value other than TracingClientHTTP or TracingClientGRPC (including the
// empty string) is rejected with an error.
func (t *TracingClientType) UnmarshalYAML(unmarshal func(interface{}) error) error {
        *t = TracingClientType("")

        // Decode through a method-less alias so that this method is not
        // invoked recursively.
        type plain TracingClientType
        if err := unmarshal((*plain)(t)); err != nil {
                return err
        }

        if *t != TracingClientHTTP && *t != TracingClientGRPC {
                // The message previously contained a duplicated "to be to be".
                return fmt.Errorf("expected tracing client type to be %s or %s, but got %s",
                        TracingClientHTTP, TracingClientGRPC, *t,
                )
        }

        return nil
}

// TracingConfig configures the tracing options.
//
// When decoded from YAML (see UnmarshalYAML), ClientType defaults to
// TracingClientGRPC, Endpoint is mandatory, Headers must not contain an
// authorization or reserved header, and Compression must be either empty or
// GzipCompression.
type TracingConfig struct {
        ClientType       TracingClientType `yaml:"client_type,omitempty"`
        Endpoint         string            `yaml:"endpoint,omitempty"`
        SamplingFraction float64           `yaml:"sampling_fraction,omitempty"`
        Insecure         bool              `yaml:"insecure,omitempty"`
        TLSConfig        config.TLSConfig  `yaml:"tls_config,omitempty"`
        Headers          map[string]string `yaml:"headers,omitempty"`
        Compression      string            `yaml:"compression,omitempty"`
        Timeout          model.Duration    `yaml:"timeout,omitempty"`
}

// SetDirectory joins any relative file paths with dir.
// Only the TLS config is forwarded dir.
func (t *TracingConfig) SetDirectory(dir string) {
        t.TLSConfig.SetDirectory(dir)
}

// UnmarshalYAML implements the yaml.Unmarshaler interface.
//
// The receiver is reset so that only ClientType carries a default
// (TracingClientGRPC). After decoding, the headers, the endpoint and the
// compression setting are validated, in that order.
func (t *TracingConfig) UnmarshalYAML(unmarshal func(interface{}) error) error {
        *t = TracingConfig{ClientType: TracingClientGRPC}

        // Decode through a method-less alias so that this method is not
        // invoked recursively.
        type plain TracingConfig
        err := unmarshal((*plain)(t))
        if err != nil {
                return err
        }

        if err = validateHeadersForTracing(t.Headers); err != nil {
                return err
        }

        if len(t.Endpoint) == 0 {
                return errors.New("tracing endpoint must be set")
        }

        // Compression is optional; when given it has to be gzip.
        switch t.Compression {
        case "", GzipCompression:
                return nil
        default:
                return fmt.Errorf("invalid compression type %s provided, valid options: %s",
                        t.Compression, GzipCompression)
        }
}

// ExemplarsConfig configures runtime reloadable configuration options
// for the in-memory exemplar storage.
type ExemplarsConfig struct {
        // MaxExemplars sets the size, in # of exemplars stored, of the single circular buffer used to store exemplars in memory.
        // Use a value of 0 or less than 0 to disable the storage without having to restart Prometheus.
        MaxExemplars int64 `yaml:"max_exemplars,omitempty"`
}

// AlertingConfig configures alerting and alertmanager related configs.
type AlertingConfig struct {
        // AlertRelabelConfigs is decoded from `alert_relabel_configs`.
        // UnmarshalYAML rejects null entries in this list.
        AlertRelabelConfigs []*relabel.Config   `yaml:"alert_relabel_configs,omitempty"`
        // AlertmanagerConfigs is decoded from `alertmanagers`.
        AlertmanagerConfigs AlertmanagerConfigs `yaml:"alertmanagers,omitempty"`
}

// SetDirectory joins any relative file paths with dir.
// It forwards dir to every configured Alertmanager config.
func (c *AlertingConfig) SetDirectory(dir string) {
        for _, amCfg := range c.AlertmanagerConfigs {
                amCfg.SetDirectory(dir)
        }
}

// UnmarshalYAML implements the yaml.Unmarshaler interface.
//
// After decoding, every entry of AlertRelabelConfigs must be non-nil.
func (c *AlertingConfig) UnmarshalYAML(unmarshal func(interface{}) error) error {
        // Start from a clean alerting config, as the previous one was already
        // populated by the default due to the YAML parser behavior for empty
        // blocks.
        *c = AlertingConfig{}

        // Decode through a method-less alias so that this method is not
        // invoked recursively.
        type plain AlertingConfig
        err := unmarshal((*plain)(c))
        if err != nil {
                return err
        }

        for i := range c.AlertRelabelConfigs {
                if c.AlertRelabelConfigs[i] == nil {
                        return errors.New("empty or null alert relabeling rule")
                }
        }
        return nil
}

// AlertmanagerConfigs is a slice of *AlertmanagerConfig.
// Use ToMap to obtain the same configs keyed by their position in the slice.
type AlertmanagerConfigs []*AlertmanagerConfig

// ToMap converts a slice of *AlertmanagerConfig to a map.
// Each config is stored under the key "config-<index>", where index is its
// zero-based position in the slice. The returned map is never nil.
func (a AlertmanagerConfigs) ToMap() map[string]*AlertmanagerConfig {
        byKey := make(map[string]*AlertmanagerConfig, len(a))
        for idx, cfg := range a {
                byKey[fmt.Sprintf("config-%d", idx)] = cfg
        }
        return byKey
}

// AlertmanagerAPIVersion represents a version of the
// github.com/prometheus/alertmanager/api, e.g. 'v1' or 'v2'.
type AlertmanagerAPIVersion string

// UnmarshalYAML implements the yaml.Unmarshaler interface.
//
// The receiver is reset to the empty string before decoding. The decoded
// value must be listed in SupportedAlertmanagerAPIVersions, otherwise an
// error is returned.
func (v *AlertmanagerAPIVersion) UnmarshalYAML(unmarshal func(interface{}) error) error {
        *v = AlertmanagerAPIVersion("")

        // Decode through a method-less alias so that this method is not
        // invoked recursively.
        type plain AlertmanagerAPIVersion
        err := unmarshal((*plain)(v))
        if err != nil {
                return err
        }

        supported := false
        for i := 0; i < len(SupportedAlertmanagerAPIVersions) && !supported; i++ {
                supported = SupportedAlertmanagerAPIVersions[i] == *v
        }
        if !supported {
                return fmt.Errorf("expected Alertmanager api version to be one of %v but got %v", SupportedAlertmanagerAPIVersions, *v)
        }
        return nil
}

const (
        // AlertmanagerAPIVersionV1 represents
        // github.com/prometheus/alertmanager/api/v1.
        AlertmanagerAPIVersionV1 AlertmanagerAPIVersion = "v1"
        // AlertmanagerAPIVersionV2 represents
        // github.com/prometheus/alertmanager/api/v2.
        AlertmanagerAPIVersionV2 AlertmanagerAPIVersion = "v2"
)

// SupportedAlertmanagerAPIVersions lists the API versions that
// AlertmanagerAPIVersion.UnmarshalYAML accepts.
var SupportedAlertmanagerAPIVersions = []AlertmanagerAPIVersion{
        AlertmanagerAPIVersionV1,
        AlertmanagerAPIVersionV2,
}

// AlertmanagerConfig configures how Alertmanagers can be discovered and communicated with.
type AlertmanagerConfig struct {
        // We cannot do proper Go type embedding below as the parser will then parse
        // values arbitrarily into the overflow maps of further-down types.

        // ServiceDiscoveryConfigs is excluded from plain YAML (un)marshaling
        // (tag "-"); UnmarshalYAML and MarshalYAML go through the discovery
        // package's inline-config helpers instead.
        ServiceDiscoveryConfigs discovery.Configs       `yaml:"-"`
        HTTPClientConfig        config.HTTPClientConfig `yaml:",inline"`
        // SigV4Config must not be combined with basic_auth, authorization or
        // oauth2; UnmarshalYAML rejects such configs.
        SigV4Config             *sigv4.SigV4Config      `yaml:"sigv4,omitempty"`

        // The URL scheme to use when talking to Alertmanagers.
        Scheme string `yaml:"scheme,omitempty"`
        // Path prefix to add in front of the push endpoint path.
        PathPrefix string `yaml:"path_prefix,omitempty"`
        // The timeout used when sending alerts.
        Timeout model.Duration `yaml:"timeout,omitempty"`

        // The api version of Alertmanager.
        APIVersion AlertmanagerAPIVersion `yaml:"api_version"`

        // List of Alertmanager relabel configurations.
        RelabelConfigs []*relabel.Config `yaml:"relabel_configs,omitempty"`
        // Relabel alerts before sending to the specific alertmanager.
        AlertRelabelConfigs []*relabel.Config `yaml:"alert_relabel_configs,omitempty"`
}

// SetDirectory joins any relative file paths with dir.
// It forwards dir to both the service discovery configs and the HTTP client
// config of this Alertmanager config.
func (c *AlertmanagerConfig) SetDirectory(dir string) {
        c.ServiceDiscoveryConfigs.SetDirectory(dir)
        c.HTTPClientConfig.SetDirectory(dir)
}

// UnmarshalYAML implements the yaml.Unmarshaler interface.
//
// The receiver is first reset to DefaultAlertmanagerConfig and then populated
// via discovery.UnmarshalYAMLWithInlineConfigs. Afterwards the result is
// checked: the HTTP client config must validate, sigv4 must not be combined
// with another authentication method, static targets are checked when no
// relabeling is configured, and no relabeling rule may be null.
func (c *AlertmanagerConfig) UnmarshalYAML(unmarshal func(interface{}) error) error {
        *c = DefaultAlertmanagerConfig
        err := discovery.UnmarshalYAMLWithInlineConfigs(c, unmarshal)
        if err != nil {
                return err
        }

        // HTTPClientConfig is inlined as a value, not a pointer (the parser
        // panics for inlined pointer structs), so its own UnmarshalYAML is never
        // invoked. Run its validation by hand instead.
        if err = c.HTTPClientConfig.Validate(); err != nil {
                return err
        }

        httpCfg := &c.HTTPClientConfig
        usesHTTPAuth := httpCfg.BasicAuth != nil || httpCfg.Authorization != nil || httpCfg.OAuth2 != nil
        if c.SigV4Config != nil && usesHTTPAuth {
                return fmt.Errorf("at most one of basic_auth, authorization, oauth2, & sigv4 must be configured")
        }

        // Catch users putting URLs in target groups. This is only checked when
        // no target relabeling is configured.
        if len(c.RelabelConfigs) == 0 {
                if err = checkStaticTargets(c.ServiceDiscoveryConfigs); err != nil {
                        return err
                }
        }

        // A null list entry in the YAML shows up as a nil pointer here.
        for i := range c.RelabelConfigs {
                if c.RelabelConfigs[i] == nil {
                        return errors.New("empty or null Alertmanager target relabeling rule")
                }
        }
        for i := range c.AlertRelabelConfigs {
                if c.AlertRelabelConfigs[i] == nil {
                        return errors.New("empty or null Alertmanager alert relabeling rule")
                }
        }

        return nil
}

// MarshalYAML implements the yaml.Marshaler interface.
// Marshaling is delegated to discovery.MarshalYAMLWithInlineConfigs.
func (c *AlertmanagerConfig) MarshalYAML() (interface{}, error) {
        return discovery.MarshalYAMLWithInlineConfigs(c)
}

// checkStaticTargets runs CheckTargetAddress on the address label of every
// target of every static config in configs and returns the first error found.
// Configs that are not a discovery.StaticConfig are ignored.
func checkStaticTargets(configs discovery.Configs) error {
        for _, cfg := range configs {
                static, isStatic := cfg.(discovery.StaticConfig)
                if isStatic {
                        for _, group := range static {
                                for _, target := range group.Targets {
                                        err := CheckTargetAddress(target[model.AddressLabel])
                                        if err != nil {
                                                return err
                                        }
                                }
                        }
                }
        }
        return nil
}

// CheckTargetAddress checks if target address is valid.
// Currently the only check is that the address contains no "/", which is
// taken as a sign that a URL was given; this may be expanded later.
func CheckTargetAddress(address model.LabelValue) error {
        if !strings.Contains(string(address), "/") {
                return nil
        }
        return fmt.Errorf("%q is not a valid hostname", address)
}

// RemoteWriteConfig is the configuration for writing to remote storage.
//
// UnmarshalYAML enforces that URL is set, that WriteRelabelConfigs has no null
// entries, that Headers contains no authorization or reserved header, and
// that at most one of the HTTP client auth methods, SigV4Config and
// AzureADConfig is configured.
type RemoteWriteConfig struct {
        URL                  *config.URL       `yaml:"url"`
        RemoteTimeout        model.Duration    `yaml:"remote_timeout,omitempty"`
        Headers              map[string]string `yaml:"headers,omitempty"`
        WriteRelabelConfigs  []*relabel.Config `yaml:"write_relabel_configs,omitempty"`
        Name                 string            `yaml:"name,omitempty"`
        SendExemplars        bool              `yaml:"send_exemplars,omitempty"`
        SendNativeHistograms bool              `yaml:"send_native_histograms,omitempty"`

        // We cannot do proper Go type embedding below as the parser will then parse
        // values arbitrarily into the overflow maps of further-down types.
        HTTPClientConfig config.HTTPClientConfig `yaml:",inline"`
        QueueConfig      QueueConfig             `yaml:"queue_config,omitempty"`
        MetadataConfig   MetadataConfig          `yaml:"metadata_config,omitempty"`
        SigV4Config      *sigv4.SigV4Config      `yaml:"sigv4,omitempty"`
        AzureADConfig    *azuread.AzureADConfig  `yaml:"azuread,omitempty"`
}

// SetDirectory joins any relative file paths with dir.
// Only the HTTP client config is forwarded dir.
func (c *RemoteWriteConfig) SetDirectory(dir string) {
        c.HTTPClientConfig.SetDirectory(dir)
}

// UnmarshalYAML implements the yaml.Unmarshaler interface.
//
// The receiver is first reset to DefaultRemoteWriteConfig. After decoding,
// the result is checked: url must be set, no write relabeling rule may be
// null, headers must be allowed, the HTTP client config must validate, and at
// most one authentication method may be configured.
func (c *RemoteWriteConfig) UnmarshalYAML(unmarshal func(interface{}) error) error {
        *c = DefaultRemoteWriteConfig

        // Decode through a method-less alias so that this method is not
        // invoked recursively.
        type plain RemoteWriteConfig
        err := unmarshal((*plain)(c))
        if err != nil {
                return err
        }
        if c.URL == nil {
                return errors.New("url for remote_write is empty")
        }
        for i := range c.WriteRelabelConfigs {
                if c.WriteRelabelConfigs[i] == nil {
                        return errors.New("empty or null relabeling rule in remote write config")
                }
        }
        if err = validateHeaders(c.Headers); err != nil {
                return err
        }

        // HTTPClientConfig is inlined as a value, not a pointer (the parser
        // panics for inlined pointer structs), so its own UnmarshalYAML is never
        // invoked. Run its validation by hand instead.
        if err = c.HTTPClientConfig.Validate(); err != nil {
                return err
        }

        httpCfg := &c.HTTPClientConfig
        usesHTTPAuth := httpCfg.BasicAuth != nil || httpCfg.Authorization != nil || httpCfg.OAuth2 != nil
        usesSigV4 := c.SigV4Config != nil
        usesAzureAD := c.AzureADConfig != nil

        // Reject any pair of authentication methods configured together.
        if (usesHTTPAuth && (usesSigV4 || usesAzureAD)) || (usesSigV4 && usesAzureAD) {
                return fmt.Errorf("at most one of basic_auth, authorization, oauth2, sigv4, & azuread must be configured")
        }

        return nil
}

// validateHeadersForTracing rejects tracing headers that set an
// authorization header or any header listed in reservedHeaders. Header names
// are compared case-insensitively.
func validateHeadersForTracing(headers map[string]string) error {
        for header := range headers {
                lowered := strings.ToLower(header)
                if lowered == "authorization" {
                        return errors.New("custom authorization header configuration is not yet supported")
                }
                if _, reserved := reservedHeaders[lowered]; reserved {
                        return fmt.Errorf("%s is a reserved header. It must not be changed", header)
                }
        }
        return nil
}

// validateHeaders rejects custom headers that set an authorization header
// (which has to be configured through the dedicated auth parameters) or any
// header listed in reservedHeaders. Header names are compared
// case-insensitively.
func validateHeaders(headers map[string]string) error {
        for header := range headers {
                lowered := strings.ToLower(header)
                if lowered == "authorization" {
                        return errors.New("authorization header must be changed via the basic_auth, authorization, oauth2, sigv4, or azuread parameter")
                }
                if _, reserved := reservedHeaders[lowered]; reserved {
                        return fmt.Errorf("%s is a reserved header. It must not be changed", header)
                }
        }
        return nil
}

// QueueConfig is the configuration for the queue used to write to remote
// storage.
type QueueConfig struct {
        // Number of samples to buffer per shard before we block. Defaults to
        // MaxSamplesPerSend.
        Capacity int `yaml:"capacity,omitempty"`

        // Max number of shards, i.e. amount of concurrency.
        MaxShards int `yaml:"max_shards,omitempty"`

        // Min number of shards, i.e. amount of concurrency.
        MinShards int `yaml:"min_shards,omitempty"`

        // Maximum number of samples per send.
        MaxSamplesPerSend int `yaml:"max_samples_per_send,omitempty"`

        // Maximum time sample will wait in buffer.
        BatchSendDeadline model.Duration `yaml:"batch_send_deadline,omitempty"`

        // On recoverable errors, backoff exponentially.
        // MinBackoff and MaxBackoff are decoded from `min_backoff` and
        // `max_backoff`; RetryOnRateLimit is decoded from `retry_on_http_429`.
        MinBackoff       model.Duration `yaml:"min_backoff,omitempty"`
        MaxBackoff       model.Duration `yaml:"max_backoff,omitempty"`
        RetryOnRateLimit bool           `yaml:"retry_on_http_429,omitempty"`

        // Samples older than the limit will be dropped.
        SampleAgeLimit model.Duration `yaml:"sample_age_limit,omitempty"`
}

// MetadataConfig is the configuration for sending metadata to remote
// storage.
type MetadataConfig struct {
        // Send controls whether we send metric metadata to remote storage.
        Send bool `yaml:"send"`
        // SendInterval controls how frequently we send metric metadata.
        SendInterval model.Duration `yaml:"send_interval"`
        // Maximum number of samples per send.
        // NOTE(review): in this struct the limit presumably applies to metadata
        // entries rather than samples - confirm against the sender.
        MaxSamplesPerSend int `yaml:"max_samples_per_send,omitempty"`
}

// RemoteReadConfig is the configuration for reading from remote storage.
//
// UnmarshalYAML enforces that URL is set, that Headers contains no
// authorization or reserved header, and that the HTTP client config validates.
type RemoteReadConfig struct {
        URL           *config.URL       `yaml:"url"`
        RemoteTimeout model.Duration    `yaml:"remote_timeout,omitempty"`
        Headers       map[string]string `yaml:"headers,omitempty"`
        ReadRecent    bool              `yaml:"read_recent,omitempty"`
        Name          string            `yaml:"name,omitempty"`

        // We cannot do proper Go type embedding below as the parser will then parse
        // values arbitrarily into the overflow maps of further-down types.
        HTTPClientConfig config.HTTPClientConfig `yaml:",inline"`

        // RequiredMatchers is an optional list of equality matchers which have to
        // be present in a selector to query the remote read endpoint.
        RequiredMatchers model.LabelSet `yaml:"required_matchers,omitempty"`

        // Whether to use the external labels as selectors for the remote read endpoint.
        FilterExternalLabels bool `yaml:"filter_external_labels,omitempty"`
}

// SetDirectory joins any relative file paths with dir.
// Only the HTTP client config is forwarded dir.
func (c *RemoteReadConfig) SetDirectory(dir string) {
        c.HTTPClientConfig.SetDirectory(dir)
}

// UnmarshalYAML implements the yaml.Unmarshaler interface.
//
// The receiver is first reset to DefaultRemoteReadConfig. After decoding, url
// must be set, headers must be allowed, and the HTTP client config must
// validate.
func (c *RemoteReadConfig) UnmarshalYAML(unmarshal func(interface{}) error) error {
        *c = DefaultRemoteReadConfig

        // Decode through a method-less alias so that this method is not
        // invoked recursively.
        type plain RemoteReadConfig
        err := unmarshal((*plain)(c))
        if err != nil {
                return err
        }
        if c.URL == nil {
                return errors.New("url for remote_read is empty")
        }
        if err = validateHeaders(c.Headers); err != nil {
                return err
        }

        // HTTPClientConfig is inlined as a value, not a pointer (the parser
        // panics for inlined pointer structs), so its own UnmarshalYAML is never
        // invoked. Run its validation by hand instead.
        if err = c.HTTPClientConfig.Validate(); err != nil {
                return err
        }
        return nil
}

// filePath returns the absolute form of filename. If the absolute path
// cannot be determined, filename is returned unchanged.
func filePath(filename string) string {
        if abs, err := filepath.Abs(filename); err == nil {
                return abs
        }
        return filename
}

// fileErr wraps err with the quoted result of filePath(filename) as a prefix.
// The original error remains reachable through errors.Is / errors.As because
// it is wrapped with %w.
func fileErr(filename string, err error) error {
        return fmt.Errorf("%q: %w", filePath(filename), err)
}

// getGoGCEnv derives the GC percentage from the GOGC environment variable.
// It returns -1 for "off" (in any letter case), the parsed integer for a
// numeric value, and DefaultRuntimeConfig.GoGC when the variable is unset,
// empty or not a valid integer.
func getGoGCEnv() int {
        raw := os.Getenv("GOGC")
        switch {
        case raw == "":
                // Not set: fall through to the default below.
        case strings.ToLower(raw) == "off":
                // Special case for GOGC=off.
                return -1
        default:
                if percent, err := strconv.Atoi(raw); err == nil {
                        return percent
                }
        }
        return DefaultRuntimeConfig.GoGC
}

// Copyright 2015 The Prometheus Authors
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package discovery

// NoopDiscovererMetrics is a dummy metrics struct for service discovery
// mechanisms that don't have any metrics.
type NoopDiscovererMetrics struct{}

// Compile-time check that *NoopDiscovererMetrics implements DiscovererMetrics.
var _ DiscovererMetrics = (*NoopDiscovererMetrics)(nil)

// Register implements discovery.DiscovererMetrics.
// It does nothing and always returns nil.
func (*NoopDiscovererMetrics) Register() error {
        return nil
}

// Unregister implements discovery.DiscovererMetrics.
// It does nothing.
func (*NoopDiscovererMetrics) Unregister() {
}

// Copyright 2020 The Prometheus Authors
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package discovery

import (
        "context"
        "reflect"

        "github.com/go-kit/log"
        "github.com/prometheus/client_golang/prometheus"
        "github.com/prometheus/common/config"

        "github.com/prometheus/prometheus/discovery/targetgroup"
)

// Discoverer provides information about target groups. It maintains a set
// of sources from which TargetGroups can originate. Whenever a discovery provider
// detects a potential change, it sends the TargetGroup through its channel.
//
// Discoverer does not know if an actual change happened.
// It does guarantee that it sends the new TargetGroup whenever a change happens.
//
// Discoverers should initially send a full set of all discoverable TargetGroups.
type Discoverer interface {
        // Run hands a channel to the discovery provider (Consul, DNS, etc.) through which
        // it can send updated target groups. It must return when the context is canceled.
        // It should not close the update channel on returning.
        Run(ctx context.Context, up chan<- []*targetgroup.Group)
}

// DiscovererMetrics is the interface to the internal metrics of service
// discovery mechanisms.
type DiscovererMetrics interface {
        // Register registers the metrics and reports any failure to do so.
        Register() error
        // Unregister removes the metrics again.
        Unregister()
}

// DiscovererOptions provides options for a Discoverer.
type DiscovererOptions struct {
        // Logger is the logger handed to the Discoverer.
        Logger log.Logger

        // Metrics are the service discovery metrics handed to the Discoverer.
        Metrics DiscovererMetrics

        // Extra HTTP client options to expose to Discoverers. This field may be
        // ignored; Discoverer implementations must opt-in to reading it.
        HTTPClientOptions []config.HTTPClientOption
}

// RefreshMetrics holds the metrics used by the "refresh" package.
// We define them here in the "discovery" package in order to avoid a cyclic dependency between
// "discovery" and "refresh".
type RefreshMetrics struct {
        Failures prometheus.Counter
        Duration prometheus.Observer
}

// RefreshMetricsInstantiator instantiates the metrics used by the "refresh" package.
type RefreshMetricsInstantiator interface {
        // Instantiate returns the RefreshMetrics for the mechanism named mech.
        Instantiate(mech string) *RefreshMetrics
}

// RefreshMetricsManager is an interface for registering, unregistering, and
// instantiating metrics for the "refresh" package.
// Refresh metrics are registered and unregistered outside of the service discovery mechanism.
// This is so that the same metrics can be reused across different service discovery mechanisms.
// To manage refresh metrics inside the SD mechanism, we'd need to use const labels which are
// specific to that SD. However, doing so would also expose too many unused metrics on
// the Prometheus /metrics endpoint.
type RefreshMetricsManager interface {
        DiscovererMetrics
        RefreshMetricsInstantiator
}

// A Config provides the configuration and constructor for a Discoverer.
type Config interface {
        // Name returns the name of the discovery mechanism.
        Name() string

        // NewDiscoverer returns a Discoverer for the Config
        // with the given DiscovererOptions.
        NewDiscoverer(DiscovererOptions) (Discoverer, error)

        // NewDiscovererMetrics returns the metrics used by the service discovery.
        NewDiscovererMetrics(prometheus.Registerer, RefreshMetricsInstantiator) DiscovererMetrics
}

// Configs is a slice of Config values that uses custom YAML marshaling and unmarshaling
// to represent itself as a mapping of the Config values grouped by their types.
type Configs []Config

// SetDirectory joins any relative file paths with dir.
// Only the contained configs that implement config.DirectorySetter are
// forwarded dir; all others are skipped.
func (c *Configs) SetDirectory(dir string) {
        for _, cfg := range *c {
                setter, ok := cfg.(config.DirectorySetter)
                if !ok {
                        continue
                }
                setter.SetDirectory(dir)
        }
}

// UnmarshalYAML implements yaml.Unmarshaler.
//
// The YAML is decoded into a freshly allocated value of the type returned by
// getConfigType(configsType); the Config values are then collected from it
// with readConfigs and stored in the receiver.
func (c *Configs) UnmarshalYAML(unmarshal func(interface{}) error) error {
        typ := getConfigType(configsType)
        ptr := reflect.New(typ)

        if err := unmarshal(ptr.Interface()); err != nil {
                return replaceYAMLTypeError(err, typ, configsType)
        }

        cfgs, err := readConfigs(ptr.Elem(), 0)
        *c = cfgs
        return err
}

// MarshalYAML implements yaml.Marshaler.
//
// The Config values are written with writeConfigs into a freshly allocated
// value of the type returned by getConfigType(configsType), and a pointer to
// that value is returned for marshaling.
func (c Configs) MarshalYAML() (interface{}, error) {
        ptr := reflect.New(getConfigType(configsType))

        err := writeConfigs(ptr.Elem(), c)
        if err != nil {
                return nil, err
        }

        return ptr.Interface(), nil
}

// A StaticConfig is a Config that provides a static list of targets.
// The list is the slice of target groups itself.
type StaticConfig []*targetgroup.Group

// Name returns the name of the service discovery mechanism, which is
// always "static".
func (StaticConfig) Name() string { return "static" }

// NewDiscoverer returns a Discoverer for the Config.
// The options are ignored and the returned error is always nil.
func (c StaticConfig) NewDiscoverer(DiscovererOptions) (Discoverer, error) {
        return staticDiscoverer(c), nil
}

// NewDiscovererMetrics returns a NoopDiscovererMetrics, because no metrics
// are needed for this service discovery mechanism. Both arguments are ignored.
func (c StaticConfig) NewDiscovererMetrics(prometheus.Registerer, RefreshMetricsInstantiator) DiscovererMetrics {
        return &NoopDiscovererMetrics{}
}

// staticDiscoverer is the Discoverer created from a StaticConfig; it is the
// list of target groups to send.
type staticDiscoverer []*targetgroup.Group

// Run sends the static target groups on up once, or returns without sending
// if ctx is done first. In both cases up is closed on return.
func (c staticDiscoverer) Run(ctx context.Context, up chan<- []*targetgroup.Group) {
        // TODO: existing implementation closes up chan, but documentation explicitly forbids it...?
        defer close(up)
        select {
        case <-ctx.Done():
        case up <- c:
        }
}

// Copyright 2016 The Prometheus Authors
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package discovery

import (
        "context"
        "fmt"
        "reflect"
        "sync"
        "time"

        "github.com/go-kit/log"
        "github.com/go-kit/log/level"
        "github.com/prometheus/client_golang/prometheus"
        "github.com/prometheus/common/config"

        "github.com/prometheus/prometheus/discovery/targetgroup"
)

// poolKey is the key type of the Manager's targets map (see NewManager).
// NOTE(review): setName and provider presumably identify a target set and the
// provider that discovered it - confirm against the Manager code.
type poolKey struct {
        setName  string
        provider string
}

// Provider holds a Discoverer instance, its configuration, cancel func and its subscribers.
type Provider struct {
        name   string
        d      Discoverer
        config interface{}

        // cancel is nil until the Discoverer has been started (see IsStarted).
        cancel context.CancelFunc
        // done should be called after cleaning up resources associated with cancelled provider.
        done func()

        mu   sync.RWMutex
        subs map[string]struct{}

        // newSubs is used to temporarily store subs to be used upon config reload completion.
        newSubs map[string]struct{}
}

// Discoverer returns the Discoverer of the provider.
func (p *Provider) Discoverer() Discoverer {
        return p.d
}

// IsStarted returns true if the Discoverer is started, i.e. if the provider
// has a cancel func set.
func (p *Provider) IsStarted() bool {
        return p.cancel != nil
}

// Config returns the configuration held by the provider.
func (p *Provider) Config() interface{} {
        return p.config
}

// CreateAndRegisterSDMetrics registers the metrics needed for SD mechanisms
// with reg and returns the map produced by RegisterSDMetrics.
// It does not register the metrics for the Discovery Manager.
// TODO(ptodev): Add ability to unregister the metrics?
func CreateAndRegisterSDMetrics(reg prometheus.Registerer) (map[string]DiscovererMetrics, error) {
        // Some SD mechanisms use the "refresh" package, which has its own
        // metrics. Those are created first and handed to RegisterSDMetrics, which
        // registers them together with the metrics specific to each SD mechanism.
        sdMetrics, err := RegisterSDMetrics(reg, NewRefreshMetrics(reg))
        if err != nil {
                return nil, fmt.Errorf("failed to register service discovery metrics: %w", err)
        }
        return sdMetrics, nil
}

// NewManager is the Discovery Manager constructor.
//
// A nil logger is replaced by a no-op logger. The options are applied in
// order before the manager metrics are created. If creating those metrics
// fails, the error is logged and nil is returned.
func NewManager(ctx context.Context, logger log.Logger, registerer prometheus.Registerer, sdMetrics map[string]DiscovererMetrics, options ...func(*Manager)) *Manager {
        if logger == nil {
                logger = log.NewNopLogger()
        }

        mgr := &Manager{
                logger:      logger,
                syncCh:      make(chan map[string][]*targetgroup.Group),
                targets:     make(map[poolKey]map[string]*targetgroup.Group),
                ctx:         ctx,
                updatert:    5 * time.Second,
                triggerSend: make(chan struct{}, 1),
                registerer:  registerer,
                sdMetrics:   sdMetrics,
        }
        for i := range options {
                options[i](mgr)
        }

        // Register the metrics.
        // This has to happen after all options are applied, so that the name of
        // the Manager is set.
        metrics, err := NewManagerMetrics(registerer, mgr.name)
        if err != nil {
                level.Error(logger).Log("msg", "Failed to create discovery manager metrics", "manager", mgr.name, "err", err)
                return nil
        }
        mgr.metrics = metrics

        return mgr
}

// Name returns a Manager option that sets the name of the manager.
// The assignment is done while holding the manager's mutex.
func Name(n string) func(*Manager) {
        setName := func(m *Manager) {
                m.mtx.Lock()
                m.name = n
                m.mtx.Unlock()
        }
        return setName
}

// Updatert returns a Manager option that sets the updatert of the manager,
// i.e. the period of the ticker used by the sender loop.
// Used to speed up tests.
func Updatert(u time.Duration) func(*Manager) {
        setInterval := func(m *Manager) {
                m.mtx.Lock()
                m.updatert = u
                m.mtx.Unlock()
        }
        return setInterval
}

// HTTPClientOptions sets the list of HTTP client options to expose to
// Discoverers. It is up to Discoverers to choose to use the options provided.
// The options are handed to each Discoverer through DiscovererOptions when a
// provider is registered.
// NOTE(review): unlike Name and Updatert, this option does not take the
// manager's mutex when writing the field.
func HTTPClientOptions(opts ...config.HTTPClientOption) func(*Manager) {
        return func(m *Manager) {
                m.httpOpts = opts
        }
}

// Manager maintains a set of discovery providers and sends each update to a map channel.
// Targets are grouped by the target set name.
//
// mtx is held by ApplyConfig for the whole reload and by cancelDiscoverers;
// targetsMtx protects the targets map.
type Manager struct {
        logger   log.Logger
        name     string
        httpOpts []config.HTTPClientOption
        mtx      sync.RWMutex
        ctx      context.Context

        // Some Discoverers (e.g. k8s) send only the updates for a given target group,
        // so we use map[tg.Source]*targetgroup.Group to know which group to update.
        targets    map[poolKey]map[string]*targetgroup.Group
        targetsMtx sync.Mutex

        // providers keeps track of SD providers.
        providers []*Provider
        // The sync channel sends the updates as a map where the key is the job value from the scrape config.
        syncCh chan map[string][]*targetgroup.Group

        // How long to wait before sending updates to the channel. The variable
        // should only be modified in unit tests.
        updatert time.Duration

        // The triggerSend channel signals to the Manager that new updates have been received from providers.
        // It has a buffer of one, so pending signals coalesce.
        triggerSend chan struct{}

        // lastProvider counts providers registered during Manager's lifetime.
        // Its value is used as the numeric suffix of new provider names.
        lastProvider uint

        // A registerer for all service discovery metrics.
        registerer prometheus.Registerer

        metrics   *Metrics
        sdMetrics map[string]DiscovererMetrics
}

// Providers returns the currently configured SD providers.
// The manager's internal slice is returned as is (no copy is made) and the
// read is not guarded by the manager's mutex.
func (m *Manager) Providers() []*Provider {
        return m.providers
}

// UnregisterMetrics unregisters manager metrics from the registerer the
// Manager was created with. It does not unregister service discovery or
// refresh metrics, whose lifecycle is managed independently of the discovery
// Manager.
func (m *Manager) UnregisterMetrics() {
        m.metrics.Unregister(m.registerer)
}

// Run starts the background processing.
// It launches the sender loop in its own goroutine, blocks until the
// manager's context is done, cancels every provider that has been started and
// returns the context's error.
func (m *Manager) Run() error {
        go m.sender()
        <-m.ctx.Done()
        m.cancelDiscoverers()
        return m.ctx.Err()
}

// SyncCh returns a read-only channel used by all the clients to receive target updates.
// Each value is a map from target set name to that set's target groups.
func (m *Manager) SyncCh() <-chan map[string][]*targetgroup.Group {
        return m.syncCh
}

// ApplyConfig checks if discovery provider with supplied config is already running and keeps them as is.
// Remaining providers are then stopped and new required providers are started using the provided config.
//
// cfg maps a target set name to the SD configs of that set. The manager's
// mutex is held for the whole reload, including the wait for cancelled
// providers to finish their cleanup. The returned error is always nil; failed
// SD configs are reported through the FailedConfigs metric instead.
func (m *Manager) ApplyConfig(cfg map[string]Configs) error {
        m.mtx.Lock()
        defer m.mtx.Unlock()

        // Register (or re-subscribe to) a provider for every SD config. After
        // this loop each provider that is still needed has a non-empty newSubs.
        var failedCount int
        for name, scfg := range cfg {
                failedCount += m.registerProviders(scfg, name)
        }
        m.metrics.FailedConfigs.Set(float64(failedCount))

        var (
                wg sync.WaitGroup
                // keep shows if we keep any providers after reload.
                keep         bool
                newProviders []*Provider
        )
        for _, prov := range m.providers {
                // Cancel obsolete providers.
                if len(prov.newSubs) == 0 {
                        wg.Add(1)
                        // done is invoked by cleaner once the provider's targets
                        // have been removed; it releases the wg.Wait() below.
                        prov.done = func() {
                                wg.Done()
                        }
                        prov.cancel()
                        continue
                }
                newProviders = append(newProviders, prov)
                // refTargets keeps reference targets used to populate new subs' targets
                var refTargets map[string]*targetgroup.Group
                prov.mu.Lock()

                m.targetsMtx.Lock()
                for s := range prov.subs {
                        keep = true
                        refTargets = m.targets[poolKey{s, prov.name}]
                        // Remove obsolete subs' targets.
                        if _, ok := prov.newSubs[s]; !ok {
                                delete(m.targets, poolKey{s, prov.name})
                                // DiscoveredTargets has a single variable label (the
                                // target set name); the manager name is a constant
                                // label and must not be passed here, otherwise the
                                // label count does not match and nothing is deleted.
                                m.metrics.DiscoveredTargets.DeleteLabelValues(s)
                        }
                }
                // Set metrics and targets for new subs.
                for s := range prov.newSubs {
                        if _, ok := prov.subs[s]; !ok {
                                m.metrics.DiscoveredTargets.WithLabelValues(s).Set(0)
                        }
                        if l := len(refTargets); l > 0 {
                                m.targets[poolKey{s, prov.name}] = make(map[string]*targetgroup.Group, l)
                                for k, v := range refTargets {
                                        m.targets[poolKey{s, prov.name}][k] = v
                                }
                        }
                }
                m.targetsMtx.Unlock()

                prov.subs = prov.newSubs
                prov.newSubs = map[string]struct{}{}
                prov.mu.Unlock()
                if !prov.IsStarted() {
                        m.startProvider(m.ctx, prov)
                }
        }
        // Currently downstream managers expect full target state upon config reload, so we must oblige.
        // While startProvider does pull the trigger, it may take some time to do so, therefore
        // we pull the trigger as soon as possible so that downstream managers can populate their state.
        // See https://github.com/prometheus/prometheus/pull/8639 for details.
        if keep {
                // Non-blocking send: a signal that is already pending is enough.
                select {
                case m.triggerSend <- struct{}{}:
                default:
                }
        }
        m.providers = newProviders
        // Wait until every cancelled provider has cleaned up its targets.
        wg.Wait()

        return nil
}

// StartCustomProvider is used for sdtool. Only use this if you know what you're doing.
//
// It wraps worker in a Provider whose only subscriber is name, appends it to
// the manager's providers and starts it with ctx.
func (m *Manager) StartCustomProvider(ctx context.Context, name string, worker Discoverer) {
        subscribers := map[string]struct{}{name: {}}
        custom := &Provider{name: name, d: worker, subs: subscribers}

        m.providers = append(m.providers, custom)
        m.startProvider(ctx, custom)
}

// startProvider runs the provider's Discoverer and the updater that consumes
// its output, each in its own goroutine. Both share a context derived from
// ctx; its cancel func is stored on the provider, which marks it as started.
func (m *Manager) startProvider(ctx context.Context, p *Provider) {
        subsDesc := fmt.Sprintf("%v", p.subs)
        level.Debug(m.logger).Log("msg", "Starting provider", "provider", p.name, "subs", subsDesc)

        provCtx, stop := context.WithCancel(ctx)
        p.cancel = stop

        // Unbuffered channel carrying target groups from the Discoverer to the updater.
        groupsCh := make(chan []*targetgroup.Group)
        go p.d.Run(provCtx, groupsCh)
        go m.updater(provCtx, p, groupsCh)
}

// cleaner cleans resources associated with provider: it drops the targets
// stored for each of the provider's subscribers and then, if set, invokes the
// provider's done callback.
func (m *Manager) cleaner(p *Provider) {
        func() {
                // Lock order: targets first, then the provider's subs.
                m.targetsMtx.Lock()
                defer m.targetsMtx.Unlock()
                p.mu.RLock()
                defer p.mu.RUnlock()

                for sub := range p.subs {
                        delete(m.targets, poolKey{setName: sub, provider: p.name})
                }
        }()

        if p.done == nil {
                return
        }
        p.done()
}

// updater consumes the target groups a provider's Discoverer sends on updates
// and stores them for every subscriber of the provider. After each update it
// signals the sender through triggerSend. It returns once ctx is done and
// always runs cleaner on the way out.
func (m *Manager) updater(ctx context.Context, p *Provider, updates chan []*targetgroup.Group) {
        // Ensure targets from this provider are cleaned up.
        defer m.cleaner(p)
        for {
                select {
                case <-ctx.Done():
                        return
                case tgs, ok := <-updates:
                        // Note: the counter is incremented before the ok check, so a
                        // closed channel is counted as a received update as well.
                        m.metrics.ReceivedUpdates.Inc()
                        if !ok {
                                level.Debug(m.logger).Log("msg", "Discoverer channel closed", "provider", p.name)
                                // Wait for provider cancellation to ensure targets are cleaned up when expected.
                                <-ctx.Done()
                                return
                        }

                        // The same groups are stored once per subscribed target set.
                        p.mu.RLock()
                        for s := range p.subs {
                                m.updateGroup(poolKey{setName: s, provider: p.name}, tgs)
                        }
                        p.mu.RUnlock()

                        // Non-blocking send: if a signal is already pending there is
                        // nothing more to do.
                        select {
                        case m.triggerSend <- struct{}{}:
                        default:
                        }
                }
        }
}

// sender forwards the full target state to syncCh. It wakes up on every tick
// of a ticker with period updatert and sends only if an update has been
// signalled on triggerSend since the last successful send. It returns when the
// manager's context is done.
func (m *Manager) sender() {
        ticker := time.NewTicker(m.updatert)
        defer ticker.Stop()

        for {
                select {
                case <-m.ctx.Done():
                        return
                case <-ticker.C: // Some discoverers send updates too often, so we throttle these with the ticker.
                        select {
                        case <-m.triggerSend:
                                // SentUpdates is incremented for every attempt, whether or
                                // not the receiver accepts the value below.
                                m.metrics.SentUpdates.Inc()
                                select {
                                case m.syncCh <- m.allGroups():
                                default:
                                        // The receiver was not ready: re-arm the trigger so the
                                        // send is retried on the next tick.
                                        m.metrics.DelayedUpdates.Inc()
                                        level.Debug(m.logger).Log("msg", "Discovery receiver's channel was full so will retry the next cycle")
                                        select {
                                        case m.triggerSend <- struct{}{}:
                                        default:
                                        }
                                }
                        default:
                                // No update was signalled since the last tick.
                        }
                }
        }
}

// cancelDiscoverers cancels every provider that has a cancel func set, i.e.
// every provider that has been started. The provider list is read under the
// manager's read lock.
func (m *Manager) cancelDiscoverers() {
        m.mtx.RLock()
        defer m.mtx.RUnlock()

        for _, prov := range m.providers {
                if prov.cancel == nil {
                        continue
                }
                prov.cancel()
        }
}

// updateGroup stores tgs under poolKey, indexed by each group's Source, so a
// later group with the same Source replaces the earlier one. The inner map is
// created on first use, even when tgs is empty.
func (m *Manager) updateGroup(poolKey poolKey, tgs []*targetgroup.Group) {
        m.targetsMtx.Lock()
        defer m.targetsMtx.Unlock()

        bySource, found := m.targets[poolKey]
        if !found {
                bySource = make(map[string]*targetgroup.Group)
                m.targets[poolKey] = bySource
        }
        for _, group := range tgs {
                // Some Discoverers send nil target group so need to check for it to avoid panics.
                if group == nil {
                        continue
                }
                bySource[group.Source] = group
        }
}

// allGroups returns every stored target group, grouped by target set name and
// merged across providers. As a side effect it sets the DiscoveredTargets
// gauge of each set that has at least one group to the set's target count.
func (m *Manager) allGroups() map[string][]*targetgroup.Group {
        groupsBySet := make(map[string][]*targetgroup.Group)
        targetCount := make(map[string]int)

        m.targetsMtx.Lock()
        defer m.targetsMtx.Unlock()

        for key, groups := range m.targets {
                set := key.setName
                for _, group := range groups {
                        // Even if the target group is empty we still need to send it to the 'Scrape manager'
                        // to signal that it needs to stop all scrape loops for this target set.
                        groupsBySet[set] = append(groupsBySet[set], group)
                        targetCount[set] += len(group.Targets)
                }
        }
        for set, count := range targetCount {
                m.metrics.DiscoveredTargets.WithLabelValues(set).Set(float64(count))
        }
        return groupsBySet
}

// registerProviders subscribes setName to a provider for each config in cfgs
// and returns the number of SD configs for which no Discoverer could be
// created. A provider whose config is deeply equal to an existing one is
// reused; otherwise a new provider is appended to m.providers.
func (m *Manager) registerProviders(cfgs Configs, setName string) int {
        failures := 0
        subscribed := false

        register := func(cfg Config) {
                // Reuse an existing provider with an identical config.
                for _, existing := range m.providers {
                        if !reflect.DeepEqual(cfg, existing.config) {
                                continue
                        }
                        existing.newSubs[setName] = struct{}{}
                        subscribed = true
                        return
                }

                sdType := cfg.Name()
                opts := DiscovererOptions{
                        Logger:            log.With(m.logger, "discovery", sdType, "config", setName),
                        HTTPClientOptions: m.httpOpts,
                        Metrics:           m.sdMetrics[sdType],
                }
                disc, err := cfg.NewDiscoverer(opts)
                if err != nil {
                        level.Error(m.logger).Log("msg", "Cannot create service discovery", "err", err, "type", sdType, "config", setName)
                        failures++
                        return
                }

                // Provider names are made unique by the running provider counter.
                created := &Provider{
                        name:    fmt.Sprintf("%s/%d", sdType, m.lastProvider),
                        d:       disc,
                        config:  cfg,
                        newSubs: map[string]struct{}{setName: {}},
                }
                m.providers = append(m.providers, created)
                m.lastProvider++
                subscribed = true
        }

        for i := range cfgs {
                register(cfgs[i])
        }
        if !subscribed {
                // Add an empty target group to force the refresh of the corresponding
                // scrape pool and to notify the receiver that this target set has no
                // current targets.
                // It can happen because the combined set of SD configurations is empty
                // or because we fail to instantiate all the SD configurations.
                register(StaticConfig{{}})
        }
        return failures
}

// StaticProvider holds a list of target groups that never change.
// Its Run method sends the list once and then closes the channel.
type StaticProvider struct {
        TargetGroups []*targetgroup.Group
}

// Run sends the static target groups on ch exactly once, unless ctx is
// cancelled first, and closes ch before returning. Its signature matches the
// Run method the Manager invokes on a provider's Discoverer.
func (sd *StaticProvider) Run(ctx context.Context, ch chan<- []*targetgroup.Group) {
        defer close(ch)

        // The consumer may exit right away, in which case the context is
        // cancelled and the send must not block forever.
        select {
        case <-ctx.Done():
        case ch <- sd.TargetGroups:
        }
}

// Copyright 2016 The Prometheus Authors
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package discovery

import (
        "fmt"

        "github.com/prometheus/client_golang/prometheus"
)

// Metrics to be used with a discovery manager.
//
// FailedConfigs is prometheus_sd_failed_configs, DiscoveredTargets is
// prometheus_sd_discovered_targets (one series per "config" label value),
// ReceivedUpdates is prometheus_sd_received_updates_total, DelayedUpdates is
// prometheus_sd_updates_delayed_total and SentUpdates is
// prometheus_sd_updates_total. All of them carry the manager name as the
// constant label "name".
type Metrics struct {
        FailedConfigs     prometheus.Gauge
        DiscoveredTargets *prometheus.GaugeVec
        ReceivedUpdates   prometheus.Counter
        DelayedUpdates    prometheus.Counter
        SentUpdates       prometheus.Counter
}

// NewManagerMetrics creates the metrics of a discovery manager and registers
// them with registerer. Every metric carries sdManagerName as the constant
// label "name"; DiscoveredTargets additionally has the variable label
// "config".
//
// If any registration fails, the collectors registered so far by this call are
// unregistered again and an error wrapping the registration error is returned,
// so a failed call leaves nothing behind in the registry.
func NewManagerMetrics(registerer prometheus.Registerer, sdManagerName string) (*Metrics, error) {
        m := &Metrics{}

        m.FailedConfigs = prometheus.NewGauge(
                prometheus.GaugeOpts{
                        Name:        "prometheus_sd_failed_configs",
                        Help:        "Current number of service discovery configurations that failed to load.",
                        ConstLabels: prometheus.Labels{"name": sdManagerName},
                },
        )

        m.DiscoveredTargets = prometheus.NewGaugeVec(
                prometheus.GaugeOpts{
                        Name:        "prometheus_sd_discovered_targets",
                        Help:        "Current number of discovered targets.",
                        ConstLabels: prometheus.Labels{"name": sdManagerName},
                },
                []string{"config"},
        )

        m.ReceivedUpdates = prometheus.NewCounter(
                prometheus.CounterOpts{
                        Name:        "prometheus_sd_received_updates_total",
                        Help:        "Total number of update events received from the SD providers.",
                        ConstLabels: prometheus.Labels{"name": sdManagerName},
                },
        )

        m.DelayedUpdates = prometheus.NewCounter(
                prometheus.CounterOpts{
                        Name:        "prometheus_sd_updates_delayed_total",
                        Help:        "Total number of update events that couldn't be sent immediately.",
                        ConstLabels: prometheus.Labels{"name": sdManagerName},
                },
        )

        m.SentUpdates = prometheus.NewCounter(
                prometheus.CounterOpts{
                        Name:        "prometheus_sd_updates_total",
                        Help:        "Total number of update events sent to the SD consumers.",
                        ConstLabels: prometheus.Labels{"name": sdManagerName},
                },
        )

        metrics := []prometheus.Collector{
                m.FailedConfigs,
                m.DiscoveredTargets,
                m.ReceivedUpdates,
                m.DelayedUpdates,
                m.SentUpdates,
        }

        for i, collector := range metrics {
                if err := registerer.Register(collector); err != nil {
                        // Roll back the collectors this call already registered;
                        // the caller gets no *Metrics on failure and could not
                        // unregister them itself.
                        for _, registered := range metrics[:i] {
                                registerer.Unregister(registered)
                        }
                        return nil, fmt.Errorf("failed to register discovery manager metrics: %w", err)
                }
        }

        return m, nil
}

// Unregister removes all five manager metrics from registerer. The result of
// each individual Unregister call is ignored.
func (m *Metrics) Unregister(registerer prometheus.Registerer) {
        collectors := [...]prometheus.Collector{
                m.FailedConfigs,
                m.DiscoveredTargets,
                m.ReceivedUpdates,
                m.DelayedUpdates,
                m.SentUpdates,
        }
        for _, collector := range collectors {
                registerer.Unregister(collector)
        }
}

// Copyright 2018 The Prometheus Authors
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package discovery

import (
        "context"
        "fmt"
        "net/url"
        "time"

        "github.com/prometheus/client_golang/prometheus"
        "k8s.io/client-go/tools/metrics"
        "k8s.io/client-go/util/workqueue"
)

// This file registers metrics used by the Kubernetes Go client (k8s.io/client-go).
// Unfortunately, k8s.io/client-go metrics are global.
// If we instantiate multiple k8s SD instances, their k8s/client-go metrics will overlap.
// To prevent us from displaying misleading metrics, we register k8s.io/client-go metrics
// outside of the Kubernetes SD.

// Metric namespaces for the client-go metrics defined in this file:
// KubernetesMetricsNamespace is used for the HTTP request metrics, and
// workqueueMetricsNamespace (the former plus "_workqueue") for the work queue
// metrics.
const (
        KubernetesMetricsNamespace = "prometheus_sd_kubernetes"
        workqueueMetricsNamespace  = KubernetesMetricsNamespace + "_workqueue"
)

// Package-level adapter instances that are installed into client-go by
// RegisterK8sClientMetricsWithPrometheus.
var (
        clientGoRequestMetrics  = &clientGoRequestMetricAdapter{}
        clientGoWorkloadMetrics = &clientGoWorkqueueMetricsProvider{}
)

// Metric vectors backing the client-go adapters. They are package-level
// values and are only registered with a registerer by
// RegisterK8sClientMetricsWithPrometheus.
// NOTE(review): the summaries are created with an empty Objectives map, which
// per the client_golang documentation yields summaries without quantiles.
var (
        // Metrics for client-go's HTTP requests.

        // Counter of HTTP requests, labelled by status code.
        clientGoRequestResultMetricVec = prometheus.NewCounterVec(
                prometheus.CounterOpts{
                        Namespace: KubernetesMetricsNamespace,
                        Name:      "http_request_total",
                        Help:      "Total number of HTTP requests to the Kubernetes API by status code.",
                },
                []string{"status_code"},
        )
        // Summary of HTTP request latencies, labelled by endpoint.
        clientGoRequestLatencyMetricVec = prometheus.NewSummaryVec(
                prometheus.SummaryOpts{
                        Namespace:  KubernetesMetricsNamespace,
                        Name:       "http_request_duration_seconds",
                        Help:       "Summary of latencies for HTTP requests to the Kubernetes API by endpoint.",
                        Objectives: map[float64]float64{},
                },
                []string{"endpoint"},
        )

        // Definition of metrics for client-go workqueue metrics provider.
        // All of them are labelled by queue name.

        // Gauge of the current queue depth.
        clientGoWorkqueueDepthMetricVec = prometheus.NewGaugeVec(
                prometheus.GaugeOpts{
                        Namespace: workqueueMetricsNamespace,
                        Name:      "depth",
                        Help:      "Current depth of the work queue.",
                },
                []string{"queue_name"},
        )
        // Counter of items added to the queue.
        clientGoWorkqueueAddsMetricVec = prometheus.NewCounterVec(
                prometheus.CounterOpts{
                        Namespace: workqueueMetricsNamespace,
                        Name:      "items_total",
                        Help:      "Total number of items added to the work queue.",
                },
                []string{"queue_name"},
        )
        // Summary of the time items spend waiting in the queue.
        clientGoWorkqueueLatencyMetricVec = prometheus.NewSummaryVec(
                prometheus.SummaryOpts{
                        Namespace:  workqueueMetricsNamespace,
                        Name:       "latency_seconds",
                        Help:       "How long an item stays in the work queue.",
                        Objectives: map[float64]float64{},
                },
                []string{"queue_name"},
        )
        // Gauge of the time unfinished items have been in progress.
        clientGoWorkqueueUnfinishedWorkSecondsMetricVec = prometheus.NewGaugeVec(
                prometheus.GaugeOpts{
                        Namespace: workqueueMetricsNamespace,
                        Name:      "unfinished_work_seconds",
                        Help:      "How long an item has remained unfinished in the work queue.",
                },
                []string{"queue_name"},
        )
        // Gauge of the duration of the longest running processor.
        clientGoWorkqueueLongestRunningProcessorMetricVec = prometheus.NewGaugeVec(
                prometheus.GaugeOpts{
                        Namespace: workqueueMetricsNamespace,
                        Name:      "longest_running_processor_seconds",
                        Help:      "Duration of the longest running processor in the work queue.",
                },
                []string{"queue_name"},
        )
        // Summary of the time spent processing an item.
        clientGoWorkqueueWorkDurationMetricVec = prometheus.NewSummaryVec(
                prometheus.SummaryOpts{
                        Namespace:  workqueueMetricsNamespace,
                        Name:       "work_duration_seconds",
                        Help:       "How long processing an item from the work queue takes.",
                        Objectives: map[float64]float64{},
                },
                []string{"queue_name"},
        )
)

// noopMetric is a dummy metric used as a placeholder if we don't want to
// observe some data. Every method discards its input and does nothing.
type noopMetric struct{}

func (noopMetric) Inc()            {}
func (noopMetric) Dec()            {}
func (noopMetric) Observe(float64) {}
func (noopMetric) Set(float64)     {}

// clientGoRequestMetricAdapter is the client-go metrics adapter for HTTP
// request observation. It is stateless; its methods record into the
// package-level request metric vectors.
type clientGoRequestMetricAdapter struct{}

// clientGoMetrics returns all of the Prometheus metrics derived from k8s.io/client-go.
// This may be used to register and unregister the metrics.
func clientGoMetrics() []prometheus.Collector {
        return []prometheus.Collector{
                clientGoRequestResultMetricVec,
                clientGoRequestLatencyMetricVec,
                clientGoWorkqueueDepthMetricVec,
                clientGoWorkqueueAddsMetricVec,
                clientGoWorkqueueLatencyMetricVec,
                clientGoWorkqueueUnfinishedWorkSecondsMetricVec,
                clientGoWorkqueueLongestRunningProcessorMetricVec,
                clientGoWorkqueueWorkDurationMetricVec,
        }
}

// RegisterK8sClientMetricsWithPrometheus first installs the request and
// workqueue adapters into client-go and then registers every client-go metric
// vector with registerer. It stops at, and returns, the first registration
// error.
func RegisterK8sClientMetricsWithPrometheus(registerer prometheus.Registerer) error {
        clientGoRequestMetrics.RegisterWithK8sGoClient()
        clientGoWorkloadMetrics.RegisterWithK8sGoClient()

        collectors := clientGoMetrics()
        for i := range collectors {
                if err := registerer.Register(collectors[i]); err != nil {
                        return fmt.Errorf("failed to register Kubernetes Go Client metrics: %w", err)
                }
        }
        return nil
}

// RegisterWithK8sGoClient hands the adapter to client-go's metrics package as
// both the request latency and the request result metric.
func (f *clientGoRequestMetricAdapter) RegisterWithK8sGoClient() {
        opts := metrics.RegisterOpts{
                RequestLatency: f,
                RequestResult:  f,
        }
        metrics.Register(opts)
}

// Increment counts one HTTP request under the given status code. The context
// and the two remaining string arguments are ignored.
func (clientGoRequestMetricAdapter) Increment(_ context.Context, code, _, _ string) {
        requests := clientGoRequestResultMetricVec.WithLabelValues(code)
        requests.Inc()
}

// Observe records latency, in seconds, under the escaped path of u. The
// context and the string argument are ignored.
func (clientGoRequestMetricAdapter) Observe(_ context.Context, _ string, u url.URL, latency time.Duration) {
        endpoint := u.EscapedPath()
        seconds := latency.Seconds()
        clientGoRequestLatencyMetricVec.WithLabelValues(endpoint).Observe(seconds)
}

// clientGoWorkqueueMetricsProvider is the client-go workqueue metrics
// provider. It is stateless; its New*Metric methods hand out children of the
// package-level workqueue metric vectors, labelled with the queue name.
type clientGoWorkqueueMetricsProvider struct{}

// RegisterWithK8sGoClient sets f as the metrics provider of client-go's
// workqueue package.
func (f *clientGoWorkqueueMetricsProvider) RegisterWithK8sGoClient() {
        workqueue.SetProvider(f)
}

// NewDepthMetric returns the depth gauge for the queue called name.
func (f *clientGoWorkqueueMetricsProvider) NewDepthMetric(name string) workqueue.GaugeMetric {
        return clientGoWorkqueueDepthMetricVec.WithLabelValues(name)
}

// NewAddsMetric returns the added-items counter for the queue called name.
func (f *clientGoWorkqueueMetricsProvider) NewAddsMetric(name string) workqueue.CounterMetric {
        return clientGoWorkqueueAddsMetricVec.WithLabelValues(name)
}

// NewLatencyMetric returns the queue latency summary for the queue called name.
func (f *clientGoWorkqueueMetricsProvider) NewLatencyMetric(name string) workqueue.HistogramMetric {
        return clientGoWorkqueueLatencyMetricVec.WithLabelValues(name)
}

// NewWorkDurationMetric returns the work duration summary for the queue called name.
func (f *clientGoWorkqueueMetricsProvider) NewWorkDurationMetric(name string) workqueue.HistogramMetric {
        return clientGoWorkqueueWorkDurationMetricVec.WithLabelValues(name)
}

// NewUnfinishedWorkSecondsMetric returns the unfinished-work gauge for the
// queue called name.
func (f *clientGoWorkqueueMetricsProvider) NewUnfinishedWorkSecondsMetric(name string) workqueue.SettableGaugeMetric {
        return clientGoWorkqueueUnfinishedWorkSecondsMetricVec.WithLabelValues(name)
}

// NewLongestRunningProcessorSecondsMetric returns the longest-running-processor
// gauge for the queue called name.
func (f *clientGoWorkqueueMetricsProvider) NewLongestRunningProcessorSecondsMetric(name string) workqueue.SettableGaugeMetric {
        return clientGoWorkqueueLongestRunningProcessorMetricVec.WithLabelValues(name)
}

// NewRetriesMetric returns a no-op counter regardless of the queue name, so
// retries are not recorded.
func (clientGoWorkqueueMetricsProvider) NewRetriesMetric(string) workqueue.CounterMetric {
        // Retries are not used so the metric is omitted.
        return noopMetric{}
}

// Copyright 2015 The Prometheus Authors
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package discovery

import (
        "github.com/prometheus/client_golang/prometheus"
)

// RefreshMetricsVecs holds the metric vectors for the "refresh" package.
// We define them here in the "discovery" package in order to avoid a cyclic dependency between
// "discovery" and "refresh".
//
// failuresVec and durationVec are both labelled by "mechanism";
// metricRegisterer registers and unregisters the two vectors.
type RefreshMetricsVecs struct {
        failuresVec *prometheus.CounterVec
        durationVec *prometheus.SummaryVec

        metricRegisterer MetricRegisterer
}

// Compile-time check that *RefreshMetricsVecs implements RefreshMetricsManager.
var _ RefreshMetricsManager = (*RefreshMetricsVecs)(nil)

// NewRefreshMetrics creates the failure counter and duration summary vectors
// of the "refresh" package, both labelled by "mechanism", and wraps them with
// a MetricRegisterer bound to reg. Nothing is registered here; that happens
// through Register.
func NewRefreshMetrics(reg prometheus.Registerer) RefreshMetricsManager {
        mechanismLabel := []string{"mechanism"}

        failures := prometheus.NewCounterVec(
                prometheus.CounterOpts{
                        Name: "prometheus_sd_refresh_failures_total",
                        Help: "Number of refresh failures for the given SD mechanism.",
                },
                mechanismLabel,
        )
        duration := prometheus.NewSummaryVec(
                prometheus.SummaryOpts{
                        Name:       "prometheus_sd_refresh_duration_seconds",
                        Help:       "The duration of a refresh in seconds for the given SD mechanism.",
                        Objectives: map[float64]float64{0.5: 0.05, 0.9: 0.01, 0.99: 0.001},
                },
                mechanismLabel,
        )

        // The reason we register metric vectors instead of metrics is so that
        // the metrics are not visible until they are recorded.
        collectors := []prometheus.Collector{failures, duration}

        return &RefreshMetricsVecs{
                failuresVec:      failures,
                durationVec:      duration,
                metricRegisterer: NewMetricRegisterer(reg, collectors),
        }
}

// Instantiate returns the failure counter and duration summary of the SD
// mechanism mech, taken from the metric vectors.
func (m *RefreshMetricsVecs) Instantiate(mech string) *RefreshMetrics {
        failures := m.failuresVec.WithLabelValues(mech)
        duration := m.durationVec.WithLabelValues(mech)

        return &RefreshMetrics{Failures: failures, Duration: duration}
}

// Register implements discovery.DiscovererMetrics.
// It delegates to the metric registerer created in NewRefreshMetrics and
// returns its error.
func (m *RefreshMetricsVecs) Register() error {
        return m.metricRegisterer.RegisterMetrics()
}

// Unregister implements discovery.DiscovererMetrics.
// It delegates to the metric registerer created in NewRefreshMetrics.
func (m *RefreshMetricsVecs) Unregister() {
        m.metricRegisterer.UnregisterMetrics()
}

// Copyright 2020 The Prometheus Authors
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package discovery

import (
        "errors"
        "fmt"
        "reflect"
        "sort"
        "strconv"
        "strings"
        "sync"

        "gopkg.in/yaml.v2"

        "github.com/prometheus/client_golang/prometheus"

        "github.com/prometheus/prometheus/discovery/targetgroup"
)

// Names used when building the struct fields for SD configs.
// configFieldPrefix is prepended to a YAML key to form the field name
// (registerConfig notes that the field must be exported); staticConfigsKey is
// the YAML key of static configs and staticConfigsFieldName the resulting
// field name.
const (
        configFieldPrefix      = "AUTO_DISCOVERY_"
        staticConfigsKey       = "static_configs"
        staticConfigsFieldName = configFieldPrefix + staticConfigsKey
)

// Package-level registry state for SD config types.
var (
        // configNames maps a Config's name to the registered Config,
        // configFieldNames maps an element type to its struct field name, and
        // configFields holds the generated struct fields sorted by field name.
        // All three are filled by registerConfig.
        configNames      = make(map[string]Config)
        configFieldNames = make(map[reflect.Type]string)
        configFields     []reflect.StructField

        // configTypesMu guards configTypes, the per-type cache used by
        // getConfigType.
        configTypesMu sync.Mutex
        configTypes   = make(map[reflect.Type]reflect.Type)

        // Cached reflect types of struct{} and Configs.
        emptyStructType = reflect.TypeOf(struct{}{})
        configsType     = reflect.TypeOf(Configs{})
)

// RegisterConfig registers the given Config type for YAML marshaling and unmarshaling.
// The YAML key is the config's Name() with "_sd_configs" appended, and the
// registered element type is the dynamic type of config.
func RegisterConfig(config Config) {
        registerConfig(config.Name()+"_sd_configs", reflect.TypeOf(config), config)
}

// init registers static_configs, the only Config type implemented by default.
// All other types are registered at init by their implementing packages.
func init() {
        // The element type of the static_configs list is *targetgroup.Group.
        groupPtrType := reflect.TypeOf((*targetgroup.Group)(nil))
        registerConfig(staticConfigsKey, groupPtrType, StaticConfig{})
}

// registerConfig records config under its Name() and adds a generated struct
// field for yamlKey whose type is a slice of elemType. It panics if a Config
// with the same name has already been registered.
func registerConfig(yamlKey string, elemType reflect.Type, config Config) {
        cfgName := config.Name()
        if _, duplicate := configNames[cfgName]; duplicate {
                panic(fmt.Sprintf("discovery: Config named %q is already registered", cfgName))
        }
        configNames[cfgName] = config

        // The prefix makes the generated field exported.
        generatedName := configFieldPrefix + yamlKey
        configFieldNames[elemType] = generatedName
        yamlTag := reflect.StructTag(`yaml:"` + yamlKey + `,omitempty"`)

        // Find the first existing field that sorts after the new one, so that
        // configFields stays ordered by name.
        pos := sort.Search(len(configFields), func(idx int) bool {
                return configFields[idx].Name > generatedName
        })

        // Grow by one slot, shift the tail right, and fill the gap.
        configFields = append(configFields, reflect.StructField{})
        copy(configFields[pos+1:], configFields[pos:])
        configFields[pos] = reflect.StructField{
                Name: generatedName,
                Type: reflect.SliceOf(elemType),
                Tag:  yamlTag,
        }
}

// getConfigType returns the dynamically built struct type used to marshal or
// unmarshal values of type out. Results are cached in configTypes under
// configTypesMu.
//
// The built type mirrors out field by field: exported fields that are not of
// type Configs are kept as they are, every other field is replaced by an
// unexported empty-struct placeholder. The registered config fields are
// appended after the mirrored ones.
func getConfigType(out reflect.Type) reflect.Type {
        configTypesMu.Lock()
        defer configTypesMu.Unlock()

        if cached, found := configTypes[out]; found {
                return cached
        }

        var fields []reflect.StructField
        for idx := 0; idx < out.NumField(); idx++ {
                f := out.Field(idx)
                if f.PkgPath != "" || f.Type == configsType {
                        // Hide the field behind a placeholder; the name must be unexported.
                        f = reflect.StructField{
                                Name:    "_" + f.Name,
                                PkgPath: out.PkgPath(),
                                Type:    emptyStructType,
                        }
                }
                fields = append(fields, f)
        }

        // The extra config fields always come last.
        fields = append(fields, configFields...)

        built := reflect.StructOf(fields)
        configTypes[out] = built
        return built
}

// UnmarshalYAMLWithInlineConfigs helps implement yaml.Unmarshal for structs
// that have a Configs field that should be inlined.
//
// out must be a pointer to a struct containing a field of type Configs;
// otherwise an error is returned. The current values of out's shared fields
// are copied into a dynamically typed value first, so they act as defaults
// for the unmarshal call. Afterwards the shared fields are copied back and the
// Configs field is set from the generated config fields.
func UnmarshalYAMLWithInlineConfigs(out interface{}, unmarshal func(interface{}) error) error {
        ptr := reflect.ValueOf(out)
        if ptr.Kind() != reflect.Ptr || ptr.Elem().Kind() != reflect.Struct {
                return fmt.Errorf("discovery: can only unmarshal into a struct pointer: %T", out)
        }
        dst := ptr.Elem()
        dstTyp := dst.Type()
        numShared := dst.NumField()

        dynTyp := getConfigType(dstTyp)
        dynPtr := reflect.New(dynTyp)
        dyn := dynPtr.Elem()

        // Seed the dynamic value with the defaults held in out and locate the
        // Configs field along the way.
        var configs *Configs
        for i := 0; i < numShared; i++ {
                switch {
                case dstTyp.Field(i).Type == configsType:
                        configs = dst.Field(i).Addr().Interface().(*Configs)
                case dynTyp.Field(i).PkgPath == "":
                        // Exported in the dynamic type, so it is a shared field.
                        dyn.Field(i).Set(dst.Field(i))
                }
        }
        if configs == nil {
                return fmt.Errorf("discovery: Configs field not found in type: %T", out)
        }

        if err := unmarshal(dynPtr.Interface()); err != nil {
                return replaceYAMLTypeError(err, dynTyp, dstTyp)
        }

        // Write the unmarshaled shared fields back into out.
        for i := 0; i < numShared; i++ {
                if dynTyp.Field(i).PkgPath == "" {
                        dst.Field(i).Set(dyn.Field(i))
                }
        }

        // The generated config fields start right after the shared ones.
        parsed, err := readConfigs(dyn, numShared)
        *configs = parsed
        return err
}

// readConfigs collects the Configs stored in the slice fields of structVal,
// starting at field index startField.
//
// Every *targetgroup.Group element gets its Source set to its running index
// and all of them are merged into one StaticConfig that is appended last.
// An empty or null list element yields an error naming the affected section.
// It panics if a field is not a slice or an element is neither a
// *targetgroup.Group nor a Config.
func readConfigs(structVal reflect.Value, startField int) (Configs, error) {
        var (
                result Configs
                static []*targetgroup.Group
        )
        for fieldIdx, numFields := startField, structVal.NumField(); fieldIdx < numFields; fieldIdx++ {
                list := structVal.Field(fieldIdx)
                if list.Kind() != reflect.Slice {
                        panic("discovery: internal error: field is not a slice")
                }
                for elemIdx := 0; elemIdx < list.Len(); elemIdx++ {
                        elem := list.Index(elemIdx)

                        empty := elem.IsZero() || (elem.Kind() == reflect.Ptr && elem.Elem().IsZero())
                        if empty {
                                section := strings.TrimPrefix(configFieldNames[list.Type().Elem()], configFieldPrefix)
                                return nil, fmt.Errorf("empty or null section in %s", section)
                        }

                        raw := elem.Interface()
                        if group, isGroup := raw.(*targetgroup.Group); isGroup {
                                // The index identifies the static target group
                                // uniquely within the scrape pool.
                                group.Source = strconv.Itoa(len(static))
                                // All static configs are coalesced into a single one below.
                                static = append(static, group)
                                continue
                        }
                        cfg, isConfig := raw.(Config)
                        if !isConfig {
                                panic("discovery: internal error: slice element is not a Config")
                        }
                        result = append(result, cfg)
                }
        }
        if len(static) > 0 {
                result = append(result, StaticConfig(static))
        }
        return result, nil
}

// MarshalYAMLWithInlineConfigs helps implement yaml.Marshal for structs
// that have a Configs field that should be inlined.
//
// in may be a struct or a pointer (of any depth) to a struct that contains a
// field of type Configs. The returned value is a pointer to a dynamically
// typed struct that holds the shared fields of in plus one generated list
// field per config type. An error is returned if in is not a struct (for
// example a nil pointer), if it has no Configs field, or if one of the
// configs has an unregistered type.
func MarshalYAMLWithInlineConfigs(in interface{}) (interface{}, error) {
        inVal := reflect.ValueOf(in)
        for inVal.Kind() == reflect.Ptr {
                inVal = inVal.Elem()
        }
        // A nil interface or nil pointer ends up as an invalid Value here;
        // report it instead of panicking in Type() below.
        if inVal.Kind() != reflect.Struct {
                return nil, fmt.Errorf("discovery: can only marshal a struct or struct pointer: %T", in)
        }
        inTyp := inVal.Type()

        cfgTyp := getConfigType(inTyp)
        cfgPtr := reflect.New(cfgTyp)
        cfgVal := cfgPtr.Elem()

        // Copy shared fields to dynamic value.
        var (
                configs Configs
                found   bool
        )
        for i, n := 0, inTyp.NumField(); i < n; i++ {
                if inTyp.Field(i).Type == configsType {
                        // Read the value directly rather than through Addr(), so
                        // that a struct passed by value (which is not addressable)
                        // works as well.
                        configs = inVal.Field(i).Interface().(Configs)
                        found = true
                }
                if cfgTyp.Field(i).PkgPath != "" {
                        continue // Field is unexported: ignore.
                }
                cfgVal.Field(i).Set(inVal.Field(i))
        }
        if !found {
                return nil, fmt.Errorf("discovery: Configs field not found in type: %T", in)
        }

        if err := writeConfigs(cfgVal, configs); err != nil {
                return nil, err
        }

        return cfgPtr.Interface(), nil
}

// writeConfigs distributes configs over the generated list fields of
// structVal. The targets of every StaticConfig are appended to the static
// configs field; any other Config is appended to the field registered for its
// dynamic type. An error is returned for a Config whose type is not registered.
func writeConfigs(structVal reflect.Value, configs Configs) error {
        staticField := structVal.FieldByName(staticConfigsFieldName)
        groups := staticField.Addr().Interface().(*[]*targetgroup.Group)

        for _, cfg := range configs {
                switch typed := cfg.(type) {
                case StaticConfig:
                        *groups = append(*groups, typed...)
                default:
                        name, registered := configFieldNames[reflect.TypeOf(cfg)]
                        if !registered {
                                return fmt.Errorf("discovery: cannot marshal unregistered Config type: %T", cfg)
                        }
                        dst := structVal.FieldByName(name)
                        dst.Set(reflect.Append(dst, reflect.ValueOf(cfg)))
                }
        }
        return nil
}

// replaceYAMLTypeError rewrites the messages of a *yaml.TypeError found in
// err, replacing the string form of oldTyp with that of newTyp. The messages
// are modified in place and err itself is returned; errors that do not
// contain a *yaml.TypeError are returned untouched.
func replaceYAMLTypeError(err error, oldTyp, newTyp reflect.Type) error {
        var typeErr *yaml.TypeError
        if !errors.As(err, &typeErr) {
                return err
        }

        from, to := oldTyp.String(), newTyp.String()
        for idx := range typeErr.Errors {
                typeErr.Errors[idx] = strings.ReplaceAll(typeErr.Errors[idx], from, to)
        }
        return err
}

// RegisterSDMetrics registers the metrics used by service discovery mechanisms.
// RegisterSDMetrics should be called only once during the lifetime of the Prometheus process.
// There is no need for the Prometheus process to unregister the metrics.
//
// It first registers the refresh metrics via rmm, then creates and registers
// the metrics of every registered Config. The returned map is keyed by the
// Config's Name(). On failure the underlying error is wrapped, so callers can
// inspect it with errors.Is / errors.As, and a nil map is returned.
func RegisterSDMetrics(registerer prometheus.Registerer, rmm RefreshMetricsManager) (map[string]DiscovererMetrics, error) {
        if err := rmm.Register(); err != nil {
                return nil, fmt.Errorf("failed to create service discovery refresh metrics: %w", err)
        }

        metrics := make(map[string]DiscovererMetrics, len(configNames))
        for _, conf := range configNames {
                sdMetrics := conf.NewDiscovererMetrics(registerer, rmm)
                if err := sdMetrics.Register(); err != nil {
                        // Name the mechanism so the failing SD can be identified.
                        return nil, fmt.Errorf("failed to create service discovery metrics for %q: %w", conf.Name(), err)
                }
                metrics[conf.Name()] = sdMetrics
        }
        return metrics, nil
}

// Copyright 2013 The Prometheus Authors
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package targetgroup

import (
        "bytes"
        "encoding/json"

        "github.com/prometheus/common/model"
)

// Group is a set of targets with a common label set (production, test, staging, etc.).
type Group struct {
        // Targets is a list of targets identified by a label set. Each target is
        // uniquely identifiable in the group by its address label.
        Targets []model.LabelSet
        // Labels is a set of labels that is common across all targets in the group.
        Labels model.LabelSet

        // Source is an identifier that describes a group of targets.
        // It is also what String returns, and it is not part of the YAML or
        // JSON representation produced by the marshaling methods.
        Source string
}

// String returns the group's Source identifier.
func (tg Group) String() string {
        return tg.Source
}

// UnmarshalYAML implements the yaml.Unmarshaler interface.
// The "targets" list is read as plain strings and each string becomes a label
// set that only carries the address label. Source is left untouched.
func (tg *Group) UnmarshalYAML(unmarshal func(interface{}) error) error {
        var raw struct {
                Targets []string       `yaml:"targets"`
                Labels  model.LabelSet `yaml:"labels"`
        }
        if err := unmarshal(&raw); err != nil {
                return err
        }

        targets := make([]model.LabelSet, len(raw.Targets))
        for i, addr := range raw.Targets {
                targets[i] = model.LabelSet{model.AddressLabel: model.LabelValue(addr)}
        }

        tg.Targets = targets
        tg.Labels = raw.Labels
        return nil
}

// MarshalYAML implements the yaml.Marshaler interface.
// Each target is reduced to the value of its address label; Source is not
// included in the output.
func (tg Group) MarshalYAML() (interface{}, error) {
        addrs := make([]string, len(tg.Targets))
        for i, target := range tg.Targets {
                addrs[i] = string(target[model.AddressLabel])
        }

        return &struct {
                Targets []string       `yaml:"targets"`
                Labels  model.LabelSet `yaml:"labels,omitempty"`
        }{
                Targets: addrs,
                Labels:  tg.Labels,
        }, nil
}

// UnmarshalJSON implements the json.Unmarshaler interface.
// Unknown fields in the input are rejected. The "targets" list is read as
// plain strings and each string becomes a label set that only carries the
// address label. Source is left untouched.
func (tg *Group) UnmarshalJSON(b []byte) error {
        var raw struct {
                Targets []string       `json:"targets"`
                Labels  model.LabelSet `json:"labels"`
        }

        decoder := json.NewDecoder(bytes.NewReader(b))
        decoder.DisallowUnknownFields()
        if err := decoder.Decode(&raw); err != nil {
                return err
        }

        targets := make([]model.LabelSet, len(raw.Targets))
        for i, addr := range raw.Targets {
                targets[i] = model.LabelSet{model.AddressLabel: model.LabelValue(addr)}
        }

        tg.Targets = targets
        tg.Labels = raw.Labels
        return nil
}

// MarshalJSON implements the json.Marshaler interface.
// Each target is reduced to the value of its address label; Source is not
// included in the output.
func (tg Group) MarshalJSON() ([]byte, error) {
        addrs := make([]string, len(tg.Targets))
        for i, target := range tg.Targets {
                addrs[i] = string(target[model.AddressLabel])
        }

        return json.Marshal(&struct {
                Targets []string       `json:"targets"`
                Labels  model.LabelSet `json:"labels,omitempty"`
        }{
                Targets: addrs,
                Labels:  tg.Labels,
        })
}

// Copyright 2020 The Prometheus Authors
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package discovery

import (
        "fmt"

        "github.com/prometheus/client_golang/prometheus"
)

// MetricRegisterer is a utility to be used by implementations of
// discovery.Discoverer which need to manage the lifetime of their metrics.
type MetricRegisterer interface {
        // RegisterMetrics registers the managed metrics and reports a failure
        // to do so as an error.
        RegisterMetrics() error
        // UnregisterMetrics unregisters the managed metrics.
        UnregisterMetrics()
}

// metricRegistererImpl is an implementation of MetricRegisterer.
type metricRegistererImpl struct {
        // reg is the registerer the metrics are registered with and
        // unregistered from.
        reg     prometheus.Registerer
        // metrics are the collectors managed by this registerer.
        metrics []prometheus.Collector
}

// Compile-time check that *metricRegistererImpl satisfies MetricRegisterer.
var _ MetricRegisterer = &metricRegistererImpl{}

// NewMetricRegisterer creates an instance of a MetricRegisterer that manages
// the given metrics on the given registerer. The metrics slice is stored as
// passed in, not copied.
// Typically called inside the implementation of the NewDiscoverer() method.
func NewMetricRegisterer(reg prometheus.Registerer, metrics []prometheus.Collector) MetricRegisterer {
        return &metricRegistererImpl{
                reg:     reg,
                metrics: metrics,
        }
}

// RegisterMetrics registers the metrics with a Prometheus registerer.
// The metrics are registered one by one, in order. As soon as one of them
// fails to register, UnregisterMetrics is called (which covers all metrics
// managed by this registerer) and the wrapped error is returned.
// Typically called at the start of the SD's Run() method.
func (rh *metricRegistererImpl) RegisterMetrics() error {
        for idx := range rh.metrics {
                if err := rh.reg.Register(rh.metrics[idx]); err != nil {
                        // Clean up so that a later RegisterMetrics() call does
                        // not fail because of a duplicate registration.
                        rh.UnregisterMetrics()

                        return fmt.Errorf("failed to register metric: %w", err)
                }
        }
        return nil
}

// UnregisterMetrics unregisters every managed metric from the same Prometheus
// registerer which was used to register them. The result of each individual
// Unregister call is ignored.
// Typically called at the end of the SD's Run() method by a defer statement.
func (rh *metricRegistererImpl) UnregisterMetrics() {
        for idx := 0; idx < len(rh.metrics); idx++ {
                rh.reg.Unregister(rh.metrics[idx])
        }
}

// Copyright 2019 The Prometheus Authors
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package exemplar

import "github.com/prometheus/prometheus/model/labels"

// ExemplarMaxLabelSetLength is the limit for the combined length of the label
// names and values of an Exemplar's LabelSet, which MUST NOT exceed 128 UTF-8
// characters:
// https://github.com/OpenObservability/OpenMetrics/blob/main/specification/OpenMetrics.md#exemplars
const ExemplarMaxLabelSetLength = 128

// Exemplar is additional information associated with a time series.
type Exemplar struct {
        // Labels is the exemplar's own label set.
        Labels labels.Labels `json:"labels"`
        // Value is the exemplar's value.
        Value  float64       `json:"value"`
        // Ts is the exemplar's timestamp (JSON key "timestamp").
        // NOTE(review): the unit is not visible here — presumably milliseconds; confirm.
        Ts     int64         `json:"timestamp"`
        // HasTs tells whether the exemplar was exported with its own timestamp.
        // Equals ignores Ts when HasTs is false on both sides. The field has
        // no JSON tag.
        HasTs  bool
}

// QueryResult groups a list of exemplars with a set of series labels.
// NOTE(review): presumably one QueryResult is returned per series by the
// exemplar query path — confirm against the caller.
type QueryResult struct {
        SeriesLabels labels.Labels `json:"seriesLabels"`
        Exemplars    []Exemplar    `json:"exemplars"`
}

// Equals reports whether the exemplar e is the same as e2, i.e. whether both
// have equal labels, equal values and, if relevant, equal timestamps.
//
// The timestamps are only compared if HasTs is true for at least one of the
// two exemplars. This matters when an exemplar is exported without its own
// timestamp, in which case the scrape timestamp is assigned to the Ts field.
// The same exemplar, scraped repeatedly without an exported timestamp, should
// still be treated as a duplicate of itself for each subsequent scrape.
func (e Exemplar) Equals(e2 Exemplar) bool {
        switch {
        case !labels.Equal(e.Labels, e2.Labels):
                return false
        case (e.HasTs || e2.HasTs) && e.Ts != e2.Ts:
                return false
        default:
                return e.Value == e2.Value
        }
}

// Compare orders two exemplars first by timestamp, then by value, then by
// labels. It returns a negative number if a sorts before b, a positive number
// if a sorts after b, and the result of labels.Compare if timestamp and value
// do not decide the order.
func Compare(a, b Exemplar) int {
        switch {
        case a.Ts < b.Ts:
                return -1
        case a.Ts > b.Ts:
                return 1
        case a.Value < b.Value:
                return -1
        case a.Value > b.Value:
                return 1
        default:
                return labels.Compare(a.Labels, b.Labels)
        }
}

// Copyright 2021 The Prometheus Authors
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package histogram

import (
        "fmt"
        "math"
        "strings"
)

// FloatHistogram is similar to Histogram but uses float64 for all
// counts. Additionally, bucket counts are absolute and not deltas.
//
// A FloatHistogram is needed by PromQL to handle operations that might result
// in fractional counts. Since the counts in a histogram are unlikely to be too
// large to be represented precisely by a float64, a FloatHistogram can also be
// used to represent a histogram with integer counts and thus serves as a more
// generalized representation.
type FloatHistogram struct {
        // Counter reset information.
        CounterResetHint CounterResetHint
        // Currently valid schema numbers are -4 <= n <= 8 for exponential buckets.
        // They are all for base-2 bucket schemas, where 1 is a bucket boundary in
        // each case, and then each power of two is divided into 2^n logarithmic buckets.
        // Or in other words, each bucket boundary is the previous boundary times
        // 2^(2^-n). Another valid schema number is -53 for custom buckets, defined by
        // the CustomValues field.
        Schema int32
        // Width of the zero bucket. Not used with a custom buckets schema.
        ZeroThreshold float64
        // Observations falling into the zero bucket. Must be zero or positive.
        // Not used with a custom buckets schema.
        ZeroCount float64
        // Total number of observations. Must be zero or positive.
        Count float64
        // Sum of observations. This is also used as the stale marker.
        Sum float64
        // Spans for positive and negative buckets (see Span below).
        // NegativeSpans is not used with a custom buckets schema.
        PositiveSpans, NegativeSpans []Span
        // Observation counts in buckets. Each represents an absolute count and
        // must be zero or positive. NegativeBuckets is not used with a custom
        // buckets schema.
        PositiveBuckets, NegativeBuckets []float64
        // Holds the custom (usually upper) bounds for bucket definitions, otherwise nil.
        // This slice is interned, to be treated as immutable and copied by reference.
        // These numbers should be strictly increasing. This field is only used when the
        // schema is for custom buckets, and the ZeroThreshold, ZeroCount, NegativeSpans
        // and NegativeBuckets fields are not used in that case.
        CustomValues []float64
}

// UsesCustomBuckets reports whether the histogram's Schema is a custom
// buckets schema, as decided by IsCustomBucketsSchema.
func (h *FloatHistogram) UsesCustomBuckets() bool {
        return IsCustomBucketsSchema(h.Schema)
}

// Copy returns a deep copy of the Histogram.
//
// With a custom buckets schema, CustomValues is copied and the zero bucket
// and negative bucket fields are left at their zero values. Otherwise the
// zero bucket and negative bucket fields are copied and CustomValues is left
// nil. Empty slices in the receiver result in nil slices in the copy.
func (h *FloatHistogram) Copy() *FloatHistogram {
        dup := &FloatHistogram{
                CounterResetHint: h.CounterResetHint,
                Schema:           h.Schema,
                Count:            h.Count,
                Sum:              h.Sum,
        }

        switch {
        case !h.UsesCustomBuckets():
                dup.ZeroThreshold = h.ZeroThreshold
                dup.ZeroCount = h.ZeroCount

                if n := len(h.NegativeSpans); n != 0 {
                        dup.NegativeSpans = make([]Span, n)
                        copy(dup.NegativeSpans, h.NegativeSpans)
                }
                if n := len(h.NegativeBuckets); n != 0 {
                        dup.NegativeBuckets = make([]float64, n)
                        copy(dup.NegativeBuckets, h.NegativeBuckets)
                }
        case len(h.CustomValues) != 0:
                dup.CustomValues = make([]float64, len(h.CustomValues))
                copy(dup.CustomValues, h.CustomValues)
        }

        // Positive spans and buckets are used by both kinds of schema.
        if n := len(h.PositiveSpans); n != 0 {
                dup.PositiveSpans = make([]Span, n)
                copy(dup.PositiveSpans, h.PositiveSpans)
        }
        if n := len(h.PositiveBuckets); n != 0 {
                dup.PositiveBuckets = make([]float64, n)
                copy(dup.PositiveBuckets, h.PositiveBuckets)
        }

        return dup
}

// CopyTo makes a deep copy into the given FloatHistogram.
// The destination object has to be a non-nil pointer.
//
// The destination's slices are passed through resize and clearIfNotNil rather
// than being replaced by freshly made ones. Fields that are not used by the
// receiver's kind of schema (custom buckets or not) are reset in the
// destination.
func (h *FloatHistogram) CopyTo(to *FloatHistogram) {
        to.CounterResetHint = h.CounterResetHint
        to.Schema = h.Schema
        to.Count = h.Count
        to.Sum = h.Sum

        if !h.UsesCustomBuckets() {
                to.ZeroThreshold = h.ZeroThreshold
                to.ZeroCount = h.ZeroCount

                to.NegativeSpans = resize(to.NegativeSpans, len(h.NegativeSpans))
                copy(to.NegativeSpans, h.NegativeSpans)

                to.NegativeBuckets = resize(to.NegativeBuckets, len(h.NegativeBuckets))
                copy(to.NegativeBuckets, h.NegativeBuckets)

                // Custom bounds do not apply to this schema.
                to.CustomValues = clearIfNotNil(to.CustomValues)
        } else {
                // Zero bucket and negative buckets do not apply to custom buckets.
                to.ZeroThreshold = 0
                to.ZeroCount = 0

                to.NegativeSpans = clearIfNotNil(to.NegativeSpans)
                to.NegativeBuckets = clearIfNotNil(to.NegativeBuckets)

                to.CustomValues = resize(to.CustomValues, len(h.CustomValues))
                copy(to.CustomValues, h.CustomValues)
        }

        // Positive spans and buckets are used by both kinds of schema.
        to.PositiveSpans = resize(to.PositiveSpans, len(h.PositiveSpans))
        copy(to.PositiveSpans, h.PositiveSpans)

        to.PositiveBuckets = resize(to.PositiveBuckets, len(h.PositiveBuckets))
        copy(to.PositiveBuckets, h.PositiveBuckets)
}

// CopyToSchema works like Copy, but the returned deep copy has the provided
// target schema, which must be ≤ the original schema (i.e. it must have a lower
// resolution). This method panics if a custom buckets schema is used in the
// receiving FloatHistogram or as the provided targetSchema, or if the target
// schema is greater than the original schema.
//
// If targetSchema equals the receiver's schema, the result is exactly that of
// Copy. Otherwise the CounterResetHint is not carried over and keeps its zero
// value in the returned histogram.
func (h *FloatHistogram) CopyToSchema(targetSchema int32) *FloatHistogram {
        switch {
        case targetSchema == h.Schema:
                // Fast path.
                return h.Copy()
        case h.UsesCustomBuckets():
                panic(fmt.Errorf("cannot reduce resolution to %d when there are custom buckets", targetSchema))
        case IsCustomBucketsSchema(targetSchema):
                panic("cannot reduce resolution to custom buckets schema")
        case targetSchema > h.Schema:
                panic(fmt.Errorf("cannot copy from schema %d to %d", h.Schema, targetSchema))
        }

        posSpans, posBuckets := reduceResolution(h.PositiveSpans, h.PositiveBuckets, h.Schema, targetSchema, false, false)
        negSpans, negBuckets := reduceResolution(h.NegativeSpans, h.NegativeBuckets, h.Schema, targetSchema, false, false)

        return &FloatHistogram{
                Schema:          targetSchema,
                ZeroThreshold:   h.ZeroThreshold,
                ZeroCount:       h.ZeroCount,
                Count:           h.Count,
                Sum:             h.Sum,
                PositiveSpans:   posSpans,
                PositiveBuckets: posBuckets,
                NegativeSpans:   negSpans,
                NegativeBuckets: negBuckets,
        }
}

// String returns a string representation of the Histogram.
//
// The output starts with count and sum, followed by the populated negative
// buckets in reverse iteration order, the zero bucket if its count is not
// zero, and the populated positive buckets. Buckets with a count of zero are
// left out.
func (h *FloatHistogram) String() string {
        var sb strings.Builder
        fmt.Fprintf(&sb, "{count:%g, sum:%g", h.Count, h.Sum)

        // Collect the negative buckets first so they can be printed in reverse.
        var negatives []Bucket[float64]
        for it := h.NegativeBucketIterator(); it.Next(); {
                if b := it.At(); b.Count != 0 {
                        negatives = append(negatives, b)
                }
        }
        for i := len(negatives); i > 0; i-- {
                sb.WriteString(", ")
                sb.WriteString(negatives[i-1].String())
        }

        if h.ZeroCount != 0 {
                sb.WriteString(", ")
                sb.WriteString(h.ZeroBucket().String())
        }

        for it := h.PositiveBucketIterator(); it.Next(); {
                if b := it.At(); b.Count != 0 {
                        sb.WriteString(", ")
                        sb.WriteString(b.String())
                }
        }

        sb.WriteByte('}')
        return sb.String()
}

// TestExpression returns the string representation of this histogram as it is used in the internal PromQL testing
// framework as well as in promtool rules unit tests.
// The syntax is described in https://prometheus.io/docs/prometheus/latest/configuration/unit_testing_rules/#series
//
// The expression is built from a compacted copy, so the receiver is not
// modified. Fields with a value of zero are left out. The method panics if,
// after compaction, there is more than one positive or more than one negative
// span.
func (h *FloatHistogram) TestExpression() string {
        compacted := h.Copy()
        // Compact to reduce the number of positive and negative spans to 1.
        compacted.Compact(math.MaxInt)

        var parts []string
        add := func(format string, args ...interface{}) {
                parts = append(parts, fmt.Sprintf(format, args...))
        }

        if compacted.Schema != 0 {
                add("schema:%d", compacted.Schema)
        }
        if compacted.Count != 0 {
                add("count:%g", compacted.Count)
        }
        if compacted.Sum != 0 {
                add("sum:%g", compacted.Sum)
        }
        if compacted.ZeroCount != 0 {
                add("z_bucket:%g", compacted.ZeroCount)
        }
        if compacted.ZeroThreshold != 0 {
                add("z_bucket_w:%g", compacted.ZeroThreshold)
        }
        if compacted.UsesCustomBuckets() {
                add("custom_values:%g", compacted.CustomValues)
        }

        // addSide emits the offset (if not zero) and the bucket list (if not
        // empty) for one side of the histogram.
        addSide := func(kind, bucketsKey, offsetKey string, buckets []float64, spans []Span) {
                if len(spans) > 1 {
                        panic(fmt.Sprintf("histogram with multiple %s spans not supported", kind))
                }
                if len(spans) == 1 && spans[0].Offset != 0 {
                        add("%s:%d", offsetKey, spans[0].Offset)
                }
                if len(buckets) == 0 {
                        return
                }
                formatted := make([]string, len(buckets))
                for i, count := range buckets {
                        formatted[i] = fmt.Sprintf("%g", count)
                }
                add("%s:[%s]", bucketsKey, strings.Join(formatted, " "))
        }
        addSide("positive", "buckets", "offset", compacted.PositiveBuckets, compacted.PositiveSpans)
        addSide("negative", "n_buckets", "n_offset", compacted.NegativeBuckets, compacted.NegativeSpans)

        return "{{" + strings.Join(parts, " ") + "}}"
}

// ZeroBucket returns the zero bucket. This method panics if the schema is for custom buckets.
// The returned bucket spans [-ZeroThreshold, ZeroThreshold] with both bounds
// inclusive and carries ZeroCount as its count.
func (h *FloatHistogram) ZeroBucket() Bucket[float64] {
        if h.UsesCustomBuckets() {
                panic("histograms with custom buckets have no zero bucket")
        }

        // Index is irrelevant for the zero bucket and stays at its zero value.
        var zero Bucket[float64]
        zero.Lower = -h.ZeroThreshold
        zero.Upper = h.ZeroThreshold
        zero.LowerInclusive = true
        zero.UpperInclusive = true
        zero.Count = h.ZeroCount
        return zero
}

// Mul multiplies the FloatHistogram by the provided factor, i.e. it scales all
// bucket counts including the zero bucket and the count and the sum of
// observations. The bucket layout stays the same. This method changes the
// receiving histogram directly (rather than acting on a copy). It returns a
// pointer to the receiving histogram for convenience.
func (h *FloatHistogram) Mul(factor float64) *FloatHistogram {
        // scale multiplies every element of counts in place.
        scale := func(counts []float64) {
                for i := range counts {
                        counts[i] *= factor
                }
        }

        h.ZeroCount *= factor
        h.Count *= factor
        h.Sum *= factor
        scale(h.PositiveBuckets)
        scale(h.NegativeBuckets)
        return h
}

// Div works like Mul but divides instead of multiplies: the zero bucket
// count, the count, the sum and all bucket counts of the receiving histogram
// are divided by scalar in place, and the receiver is returned.
// When dividing by 0, everything will be set to Inf.
func (h *FloatHistogram) Div(scalar float64) *FloatHistogram {
        // divide divides every element of counts in place.
        divide := func(counts []float64) {
                for i := range counts {
                        counts[i] /= scalar
                }
        }

        h.ZeroCount /= scalar
        h.Count /= scalar
        h.Sum /= scalar
        divide(h.PositiveBuckets)
        divide(h.NegativeBuckets)
        return h
}

// Add merges the provided other histogram into the receiving histogram: Count,
// Sum, and all buckets of other are added to their counterparts in the
// receiver, and buckets that only exist in other are inserted into the
// receiver. The result may contain buckets with a population of zero or
// directly adjacent spans (offset=0); call Compact to normalize those.
//
// Differences in the zero threshold and in the schema are reconciled, changing
// the receiver if needed. The other histogram is never modified. Adding is
// currently only supported between two exponential histograms, or between two
// custom buckets histograms with the exact same custom bounds; otherwise
// ErrHistogramsIncompatibleSchema or ErrHistogramsIncompatibleBounds is
// returned together with a nil histogram.
//
// On success, the receiving histogram is returned for convenience.
func (h *FloatHistogram) Add(other *FloatHistogram) (*FloatHistogram, error) {
        if h.UsesCustomBuckets() != other.UsesCustomBuckets() {
                return nil, ErrHistogramsIncompatibleSchema
        }
        if h.UsesCustomBuckets() && !FloatBucketsMatch(h.CustomValues, other.CustomValues) {
                return nil, ErrHistogramsIncompatibleBounds
        }

        // Settle the CounterResetHint of the result.
        switch {
        case other.CounterResetHint == h.CounterResetHint,
                h.CounterResetHint == GaugeType:
                // Either both hints agree, or something is added to a gauge
                // (which is probably OK and yields a gauge). The receiver
                // already carries the right hint.
        case other.CounterResetHint == GaugeType:
                // A gauge is added to "something else": the outcome is a gauge.
                h.CounterResetHint = GaugeType
        default:
                // At least one side is "unknown", which could still be legitimate
                // if the caller knows what they are doing, or CounterReset
                // collides directly with NotCounterReset, which shouldn't
                // actually happen. Either way, the conservative outcome is
                // "unknown"; the collision case should also issue a warning.
                h.CounterResetHint = UnknownCounterReset
                // TODO(trevorwhitney): Actually issue the warning as soon as the plumbing for it is in place
        }

        if h.UsesCustomBuckets() {
                // Custom buckets histograms have neither a zero bucket nor
                // negative buckets to merge, and the bounds were verified to be
                // identical above.
                h.Count += other.Count
                h.Sum += other.Sum
                h.PositiveSpans, h.PositiveBuckets = addBuckets(
                        h.Schema, h.ZeroThreshold, false,
                        h.PositiveSpans, h.PositiveBuckets, other.PositiveSpans, other.PositiveBuckets,
                )
                return h, nil
        }

        // reconcileZeroBuckets may rewrite the receiver's zero bucket and
        // buckets, so its result is taken before anything else is read.
        zeroCountDelta := h.reconcileZeroBuckets(other)
        h.ZeroCount += zeroCountDelta
        h.Count += other.Count
        h.Sum += other.Sum

        var (
                posSpans, posBuckets           = h.PositiveSpans, h.PositiveBuckets
                negSpans, negBuckets           = h.NegativeSpans, h.NegativeBuckets
                otherPosSpans, otherPosBuckets = other.PositiveSpans, other.PositiveBuckets
                otherNegSpans, otherNegBuckets = other.NegativeSpans, other.NegativeBuckets
        )

        // Bring both operands to the lower of the two schemas.
        if other.Schema < h.Schema {
                posSpans, posBuckets = reduceResolution(posSpans, posBuckets, h.Schema, other.Schema, false, true)
                negSpans, negBuckets = reduceResolution(negSpans, negBuckets, h.Schema, other.Schema, false, true)
                h.Schema = other.Schema
        } else if other.Schema > h.Schema {
                otherPosSpans, otherPosBuckets = reduceResolution(otherPosSpans, otherPosBuckets, other.Schema, h.Schema, false, false)
                otherNegSpans, otherNegBuckets = reduceResolution(otherNegSpans, otherNegBuckets, other.Schema, h.Schema, false, false)
        }

        h.PositiveSpans, h.PositiveBuckets = addBuckets(h.Schema, h.ZeroThreshold, false, posSpans, posBuckets, otherPosSpans, otherPosBuckets)
        h.NegativeSpans, h.NegativeBuckets = addBuckets(h.Schema, h.ZeroThreshold, false, negSpans, negBuckets, otherNegSpans, otherNegBuckets)

        return h, nil
}

// Sub is the counterpart of Add: Count, Sum, and all buckets of the other
// histogram are subtracted from the receiving histogram, after reconciling
// zero threshold and schema in the same way as Add does. The same
// compatibility rules apply (both exponential, or both custom buckets with
// identical bounds), and the other histogram is never modified. In contrast to
// Add, this method leaves the receiver's CounterResetHint alone.
//
// On success, the receiving histogram is returned for convenience.
func (h *FloatHistogram) Sub(other *FloatHistogram) (*FloatHistogram, error) {
        if h.UsesCustomBuckets() != other.UsesCustomBuckets() {
                return nil, ErrHistogramsIncompatibleSchema
        }
        if h.UsesCustomBuckets() && !FloatBucketsMatch(h.CustomValues, other.CustomValues) {
                return nil, ErrHistogramsIncompatibleBounds
        }

        if h.UsesCustomBuckets() {
                // Custom buckets histograms have neither a zero bucket nor
                // negative buckets to process, and the bounds were verified to
                // be identical above.
                h.Count -= other.Count
                h.Sum -= other.Sum
                h.PositiveSpans, h.PositiveBuckets = addBuckets(
                        h.Schema, h.ZeroThreshold, true,
                        h.PositiveSpans, h.PositiveBuckets, other.PositiveSpans, other.PositiveBuckets,
                )
                return h, nil
        }

        // reconcileZeroBuckets may rewrite the receiver's zero bucket and
        // buckets, so its result is taken before anything else is read.
        zeroCountDelta := h.reconcileZeroBuckets(other)
        h.ZeroCount -= zeroCountDelta
        h.Count -= other.Count
        h.Sum -= other.Sum

        var (
                posSpans, posBuckets           = h.PositiveSpans, h.PositiveBuckets
                negSpans, negBuckets           = h.NegativeSpans, h.NegativeBuckets
                otherPosSpans, otherPosBuckets = other.PositiveSpans, other.PositiveBuckets
                otherNegSpans, otherNegBuckets = other.NegativeSpans, other.NegativeBuckets
        )

        // Bring both operands to the lower of the two schemas.
        if other.Schema < h.Schema {
                posSpans, posBuckets = reduceResolution(posSpans, posBuckets, h.Schema, other.Schema, false, true)
                negSpans, negBuckets = reduceResolution(negSpans, negBuckets, h.Schema, other.Schema, false, true)
                h.Schema = other.Schema
        } else if other.Schema > h.Schema {
                otherPosSpans, otherPosBuckets = reduceResolution(otherPosSpans, otherPosBuckets, other.Schema, h.Schema, false, false)
                otherNegSpans, otherNegBuckets = reduceResolution(otherNegSpans, otherNegBuckets, other.Schema, h.Schema, false, false)
        }

        h.PositiveSpans, h.PositiveBuckets = addBuckets(h.Schema, h.ZeroThreshold, true, posSpans, posBuckets, otherPosSpans, otherPosBuckets)
        h.NegativeSpans, h.NegativeBuckets = addBuckets(h.Schema, h.ZeroThreshold, true, negSpans, negBuckets, otherNegSpans, otherNegBuckets)

        return h, nil
}

// Equals reports whether h2 holds exactly the same data as the receiving
// histogram. A nil h2 never matches.
//
// An exact match means that there are neither additional buckets (not even
// empty ones) nor missing buckets, and that all bucket values match. Spans may
// be split differently by empty spans as long as they describe the same bucket
// layout. Sum, Count, and ZeroCount are compared by their bit patterns because
// this method is about data equality rather than mathematical equality;
// ZeroThreshold is compared with the regular != operator.
//
// Custom bounds are only compared if the receiver uses custom buckets. Fields
// that are not supposed to be used according to the schema (like the zero
// bucket or the negative buckets of a custom buckets histogram) are still
// compared because differences there may cause unintended behaviour.
func (h *FloatHistogram) Equals(h2 *FloatHistogram) bool {
        // The cases are evaluated top to bottom; the first mismatch decides.
        switch {
        case h2 == nil:
                return false
        case h.Schema != h2.Schema,
                math.Float64bits(h.Count) != math.Float64bits(h2.Count),
                math.Float64bits(h.Sum) != math.Float64bits(h2.Sum):
                return false
        case h.UsesCustomBuckets() && !FloatBucketsMatch(h.CustomValues, h2.CustomValues):
                return false
        case h.ZeroThreshold != h2.ZeroThreshold,
                math.Float64bits(h.ZeroCount) != math.Float64bits(h2.ZeroCount):
                return false
        case !spansMatch(h.NegativeSpans, h2.NegativeSpans),
                !FloatBucketsMatch(h.NegativeBuckets, h2.NegativeBuckets):
                return false
        case !spansMatch(h.PositiveSpans, h2.PositiveSpans),
                !FloatBucketsMatch(h.PositiveBuckets, h2.PositiveBuckets):
                return false
        }
        return true
}

// Size returns the total size of the FloatHistogram in bytes. The total covers
// the pointer to the FloatHistogram, all of its fields, and every element held
// in its slices.
// NOTE: this is only valid for 64 bit architectures.
func (h *FloatHistogram) Size() int {
        const (
                spanBytes  = 8 // One Span: int32 + uint32.
                floatBytes = 8 // One float64 (bucket value or custom bound).

                // Fixed footprint, independent of the slice contents:
                //   pointer to the histogram:             8 bytes
                //   CounterResetHint (1 byte + padding):  4 bytes
                //   Schema:                               4 bytes
                //   ZeroThreshold, ZeroCount, Count, Sum: 4 * 8 bytes
                //   five slice headers (PositiveSpans,
                //   NegativeSpans, PositiveBuckets,
                //   NegativeBuckets, CustomValues):       5 * 24 bytes
                structBytes = 168
        )

        spanCount := len(h.PositiveSpans) + len(h.NegativeSpans)
        floatCount := len(h.PositiveBuckets) + len(h.NegativeBuckets) + len(h.CustomValues)

        return structBytes + spanCount*spanBytes + floatCount*floatBytes
}

// Compact removes empty buckets at the beginning and end of each span, then
// merges spans that are consecutive or at most maxEmptyBuckets apart, and
// finally splits spans containing more consecutive empty buckets than
// maxEmptyBuckets. (The actual implementation might do something more
// efficient but with the same result.) The receiving histogram is compacted
// "in place", and a pointer to it is returned for convenience.
//
// Which value of maxEmptyBuckets is ideal depends on circumstances. Setting
// maxEmptyBuckets > 0 is motivated by the assumption that it is less overhead
// to represent very few empty buckets explicitly within one span than to cut
// the span in two and treat the empty buckets as a gap between them, both in
// terms of storage requirement and in terms of encoding and decoding effort.
// However, the tradeoffs are subtle. For one, they differ between the
// exposition format, a TSDB chunk, and the in-memory representation as Go
// types. As an additional aspect in the TSDB, the span layout is only stored
// once per chunk, while the many histograms sharing that layout are stored
// with their buckets only (so that even a single empty bucket will be stored
// many times).
//
// For the Go types, an additional Span takes 8 bytes, and so does an
// additional bucket. With a single separating empty bucket, both options
// therefore have the same storage requirement, but the single-span solution is
// easier to iterate through. Still, the safest bet is to use
// maxEmptyBuckets==0 and only use a larger number if you know what you are
// doing.
func (h *FloatHistogram) Compact(maxEmptyBuckets int) *FloatHistogram {
        posBuckets, posSpans := compactBuckets(h.PositiveBuckets, h.PositiveSpans, maxEmptyBuckets, false)
        h.PositiveBuckets, h.PositiveSpans = posBuckets, posSpans

        negBuckets, negSpans := compactBuckets(h.NegativeBuckets, h.NegativeSpans, maxEmptyBuckets, false)
        h.NegativeBuckets, h.NegativeSpans = negBuckets, negSpans

        return h
}

// DetectReset reports whether the receiving histogram constitutes a counter
// reset relative to the provided previous histogram. That is the case if the
// receiver lacks a bucket that is populated in the previous histogram, or if
// any count (of any bucket, the zero count, or the count of observations, but
// NOT the sum of observations) is smaller in the receiver than in the previous
// histogram. Otherwise, false is returned.
//
// The receiver's CounterResetHint can shortcut the detection: CounterReset
// yields true and NotCounterReset yields false right away. With any other
// hint, the detailed comparison is performed.
//
// A change between exponential and custom buckets, or a change of the custom
// bounds, is reported as a reset, too.
//
// Special behavior in case the Schema or the ZeroThreshold are not the same in
// both histograms:
//
//   - A decrease of the ZeroThreshold or an increase of the Schema (i.e. an
//     increase of resolution) can only happen together with a reset, so true
//     is returned in either case.
//
//   - Upon an increase of the ZeroThreshold, the buckets of the previous
//     histogram that fall within the new ZeroThreshold are counted towards the
//     ZeroCount of the previous histogram (without mutating the provided
//     previous histogram). A populated bucket of the previous histogram that
//     is partially within and partially outside of the new ZeroThreshold can
//     only happen together with a counter reset and therefore yields true.
//
//   - Upon a decrease of the Schema, the buckets of the previous histogram are
//     merged to match the new, lower-resolution schema (again without mutating
//     the provided previous histogram).
func (h *FloatHistogram) DetectReset(previous *FloatHistogram) bool {
        switch {
        case h.CounterResetHint == CounterReset:
                return true
        case h.CounterResetHint == NotCounterReset:
                return false
        }
        // The remaining hints are UnknownCounterReset and GaugeType.
        //
        // With UnknownCounterReset, it is not known yet whether this histogram
        // comes with a counter reset, so all the detailed work below is needed
        // to find out.
        // GaugeType should not show up here, but PromQL still allows the user to
        // apply functions to gauge histograms that are only meant for counter
        // histograms. Gauge histograms are then treated like counter histograms
        // (and the user should receive a warning).
        switch {
        case h.Count < previous.Count:
                return true
        case h.UsesCustomBuckets() != previous.UsesCustomBuckets():
                // Something has changed or the application has been restarted.
                // This does not matter so much since the change in schema is
                // handled directly in the chunks and PromQL functions.
                return true
        case h.UsesCustomBuckets() && !FloatBucketsMatch(h.CustomValues, previous.CustomValues):
                // Same reasoning as above, but for changed custom bounds.
                return true
        case h.Schema > previous.Schema:
                // Resolution increased.
                return true
        case h.ZeroThreshold < previous.ZeroThreshold:
                // ZeroThreshold decreased.
                return true
        }

        prevZeroCount, adjustedThreshold := previous.zeroCountForLargerThreshold(h.ZeroThreshold)
        if adjustedThreshold != h.ZeroThreshold {
                // ZeroThreshold is within a populated bucket in previous
                // histogram.
                return true
        }
        if h.ZeroCount < prevZeroCount {
                return true
        }

        // Compare the buckets, positive side first, with the previous
        // histogram's buckets converted to the receiver's zero threshold and
        // schema.
        for _, positive := range [2]bool{true, false} {
                curr := h.floatBucketIterator(positive, h.ZeroThreshold, h.Schema)
                prev := previous.floatBucketIterator(positive, h.ZeroThreshold, h.Schema)
                if detectReset(&curr, &prev) {
                        return true
                }
        }
        return false
}

// detectReset compares the buckets yielded by currIt (current histogram)
// against those yielded by prevIt (previous histogram) and reports whether
// they indicate a counter reset. That is the case if a populated bucket of the
// previous histogram is missing in the current one, or if a bucket present in
// both has a smaller count in the current histogram. Both iterators are
// consumed in the process.
//
// NOTE(review): the matching logic relies on both iterators yielding buckets
// in ascending index order, which is presumed but not visible here.
func detectReset(currIt, prevIt *floatBucketIterator) bool {
        if !prevIt.Next() {
                return false // If no buckets in previous histogram, nothing can be reset.
        }
        prevBucket := prevIt.strippedAt()
        if !currIt.Next() {
                // No bucket in current, but at least one in previous
                // histogram. Check if any of those are non-zero, in which case
                // this is a reset.
                for {
                        if prevBucket.count != 0 {
                                return true
                        }
                        if !prevIt.Next() {
                                return false
                        }
                        // Advance to the bucket just stepped to; without this,
                        // the first bucket would be inspected over and over and
                        // populated later buckets would be missed.
                        prevBucket = prevIt.strippedAt()
                }
        }
        currBucket := currIt.strippedAt()
        for {
                // Forward currIt until we find the bucket corresponding to prevBucket.
                for currBucket.index < prevBucket.index {
                        if !currIt.Next() {
                                // Reached end of currIt early, therefore
                                // previous histogram has a bucket that the
                                // current one does not have. Unless all
                                // remaining buckets in the previous histogram
                                // are unpopulated, this is a reset.
                                for {
                                        if prevBucket.count != 0 {
                                                return true
                                        }
                                        if !prevIt.Next() {
                                                return false
                                        }
                                        // See above: keep prevBucket in sync
                                        // with the iterator.
                                        prevBucket = prevIt.strippedAt()
                                }
                        }
                        currBucket = currIt.strippedAt()
                }
                if currBucket.index > prevBucket.index {
                        // Previous histogram has a bucket the current one does
                        // not have. If it's populated, it's a reset.
                        if prevBucket.count != 0 {
                                return true
                        }
                } else {
                        // We have reached corresponding buckets in both iterators.
                        // We can finally compare the counts.
                        if currBucket.count < prevBucket.count {
                                return true
                        }
                }
                if !prevIt.Next() {
                        // Reached end of prevIt without finding offending buckets.
                        return false
                }
                prevBucket = prevIt.strippedAt()
        }
}

// PositiveBucketIterator returns a BucketIterator that walks all positive
// buckets in ascending order, i.e. it starts next to the zero bucket and moves
// upwards from there.
func (h *FloatHistogram) PositiveBucketIterator() BucketIterator[float64] {
        const (
                positive   = true
                startValue = 0 // Do not skip any buckets.
        )
        positives := h.floatBucketIterator(positive, startValue, h.Schema)
        return &positives
}

// NegativeBucketIterator returns a BucketIterator that walks all negative
// buckets in descending order, i.e. it starts next to the zero bucket and
// moves downwards from there.
func (h *FloatHistogram) NegativeBucketIterator() BucketIterator[float64] {
        const (
                positive   = false
                startValue = 0 // Do not skip any buckets.
        )
        negatives := h.floatBucketIterator(positive, startValue, h.Schema)
        return &negatives
}

// PositiveReverseBucketIterator returns a BucketIterator that walks all
// positive buckets in descending order, i.e. it starts at the highest bucket
// and moves down towards the zero bucket.
func (h *FloatHistogram) PositiveReverseBucketIterator() BucketIterator[float64] {
        const positive = true
        positives := newReverseFloatBucketIterator(
                h.PositiveSpans, h.PositiveBuckets, h.Schema, positive, h.CustomValues,
        )
        return &positives
}

// NegativeReverseBucketIterator returns a BucketIterator that walks all
// negative buckets in ascending order, i.e. it starts at the lowest bucket and
// moves up towards the zero bucket. No custom values are handed to the
// iterator.
func (h *FloatHistogram) NegativeReverseBucketIterator() BucketIterator[float64] {
        const positive = false
        negatives := newReverseFloatBucketIterator(
                h.NegativeSpans, h.NegativeBuckets, h.Schema, positive, nil,
        )
        return &negatives
}

// AllBucketIterator returns a BucketIterator that walks all negative, zero,
// and positive buckets in ascending order (starting at the lowest bucket and
// going up). If the highest negative bucket or the lowest positive bucket
// overlap with the zero bucket, their upper or lower boundary, respectively,
// is set to the zero threshold.
func (h *FloatHistogram) AllBucketIterator() BucketIterator[float64] {
        // Ascending order means: negative buckets from the lowest one up
        // (reverse iteration), then the positive buckets in regular order.
        negativesUp := newReverseFloatBucketIterator(h.NegativeSpans, h.NegativeBuckets, h.Schema, false, nil)
        positivesUp := h.floatBucketIterator(true, 0, h.Schema)

        return &allFloatBucketIterator{
                h:         h,
                leftIter:  negativesUp,
                rightIter: positivesUp,
                state:     -1,
        }
}

// AllReverseBucketIterator returns a BucketIterator that walks all positive,
// zero, and negative buckets in descending order (starting at the highest
// bucket and going down). If the highest negative bucket or the lowest
// positive bucket overlap with the zero bucket, their upper or lower boundary,
// respectively, is set to the zero threshold.
func (h *FloatHistogram) AllReverseBucketIterator() BucketIterator[float64] {
        // Descending order means: positive buckets from the highest one down
        // (reverse iteration), then the negative buckets in regular order.
        positivesDown := newReverseFloatBucketIterator(h.PositiveSpans, h.PositiveBuckets, h.Schema, true, h.CustomValues)
        negativesDown := h.floatBucketIterator(false, 0, h.Schema)

        return &allFloatBucketIterator{
                h:         h,
                leftIter:  positivesDown,
                rightIter: negativesDown,
                state:     -1,
        }
}

// Validate checks that the span and bucket slices are consistent with each
// other and has the buckets checked against negative values. It also makes
// sure that there are no unexpected fields or field values for the kind of
// schema in use:
//
//   - A custom buckets histogram must have a zero count and a zero threshold
//     of 0 and neither negative spans nor negative buckets.
//   - An exponential histogram must not have custom bounds.
//
// Whether h.Count is at least as large as the sum of the counts in the buckets
// is deliberately not checked because floating point precision issues can
// create false positives here.
func (h *FloatHistogram) Validate() error {
        var nCount, pCount float64

        if h.UsesCustomBuckets() {
                if err := checkHistogramCustomBounds(h.CustomValues, h.PositiveSpans, len(h.PositiveBuckets)); err != nil {
                        return fmt.Errorf("custom buckets: %w", err)
                }
                // Everything related to the zero bucket and to the negative
                // side must be unset.
                switch {
                case h.ZeroCount != 0:
                        return fmt.Errorf("custom buckets: must have zero count of 0")
                case h.ZeroThreshold != 0:
                        return fmt.Errorf("custom buckets: must have zero threshold of 0")
                case len(h.NegativeSpans) > 0:
                        return fmt.Errorf("custom buckets: must not have negative spans")
                case len(h.NegativeBuckets) > 0:
                        return fmt.Errorf("custom buckets: must not have negative buckets")
                }
        } else {
                if err := checkHistogramSpans(h.PositiveSpans, len(h.PositiveBuckets)); err != nil {
                        return fmt.Errorf("positive side: %w", err)
                }
                if err := checkHistogramSpans(h.NegativeSpans, len(h.NegativeBuckets)); err != nil {
                        return fmt.Errorf("negative side: %w", err)
                }
                if err := checkHistogramBuckets(h.NegativeBuckets, &nCount, false); err != nil {
                        return fmt.Errorf("negative side: %w", err)
                }
                if h.CustomValues != nil {
                        return fmt.Errorf("histogram with exponential schema must not have custom bounds")
                }
        }

        // The positive buckets are checked for both kinds of schema.
        if err := checkHistogramBuckets(h.PositiveBuckets, &pCount, false); err != nil {
                return fmt.Errorf("positive side: %w", err)
        }

        return nil
}

// zeroCountForLargerThreshold returns what the histogram's zero count would be
// if the ZeroThreshold had the provided larger (or equal) value. If the
// provided value is less than the histogram's ZeroThreshold, the method panics.
// If the largerThreshold ends up within a populated bucket of the histogram, it
// is adjusted upwards to the boundary of that bucket that is farther away from
// zero (the upper boundary of a positive bucket, the negated lower boundary of
// a negative bucket) and that bucket's count is included in the returned
// count. The adjusted threshold is returned, too.
//
// The histogram itself is not modified by the assignments in this method; only
// the returned values reflect the larger threshold.
func (h *FloatHistogram) zeroCountForLargerThreshold(largerThreshold float64) (count, threshold float64) {
        // Fast path.
        if largerThreshold == h.ZeroThreshold {
                return h.ZeroCount, largerThreshold
        }
        if largerThreshold < h.ZeroThreshold {
                panic(fmt.Errorf("new threshold %f is less than old threshold %f", largerThreshold, h.ZeroThreshold))
        }
        // Each pass of the outer loop recomputes the count from scratch for the
        // current value of largerThreshold. A pass is repeated whenever the
        // negative side forces the threshold to grow.
outer:
        for {
                count = h.ZeroCount
                i := h.PositiveBucketIterator()
                for i.Next() {
                        b := i.At()
                        if b.Lower >= largerThreshold {
                                // This bucket is outside of the new zero
                                // bucket; stop looking at the positive side.
                                break
                        }
                        count += b.Count // Bucket to be merged into zero bucket.
                        if b.Upper > largerThreshold {
                                // New threshold ended up within a bucket. If it's
                                // populated, we need to adjust largerThreshold before
                                // we are done here.
                                if b.Count != 0 {
                                        largerThreshold = b.Upper
                                }
                                break
                        }
                }
                i = h.NegativeBucketIterator()
                for i.Next() {
                        b := i.At()
                        if b.Upper <= -largerThreshold {
                                // This bucket is outside of the new zero
                                // bucket; stop looking at the negative side.
                                break
                        }
                        count += b.Count // Bucket to be merged into zero bucket.
                        if b.Lower < -largerThreshold {
                                // New threshold ended up within a bucket. If
                                // it's populated, we need to adjust
                                // largerThreshold and have to redo the whole
                                // thing because the treatment of the positive
                                // buckets is invalid now.
                                if b.Count != 0 {
                                        largerThreshold = -b.Lower
                                        continue outer
                                }
                                break
                        }
                }
                return count, largerThreshold
        }
}

// trimBucketsInZeroBucket drops all buckets that lie within the zero bucket.
// It assumes that the zero threshold sits at a bucket boundary and that the
// counts of the buckets to drop have already been added to the zero count.
func (h *FloatHistogram) trimBucketsInZeroBucket() {
        // Zero out the positive buckets below the zero threshold. The iterator
        // position and idx advance in lockstep.
        positives := h.PositiveBucketIterator()
        for idx := 0; positives.Next(); idx++ {
                if positives.At().Lower >= h.ZeroThreshold {
                        break
                }
                h.PositiveBuckets[idx] = 0
        }

        // Same for the negative buckets above the negated zero threshold.
        negatives := h.NegativeBucketIterator()
        for idx := 0; negatives.Next(); idx++ {
                if negatives.At().Upper <= -h.ZeroThreshold {
                        break
                }
                h.NegativeBuckets[idx] = 0
        }

        // Compact is abused here to trim the buckets that were set to zero
        // above. Compacting prematurely could cause additional cost, but this
        // code path is probably rarely used anyway.
        h.Compact(0)
}

// reconcileZeroBuckets determines a zero bucket that is large enough to
// include the zero buckets of both the receiving and the other histogram, with
// a zero threshold that is not within a populated bucket of either histogram.
// The receiving histogram is modified accordingly, while the other histogram
// is left as is. Instead, the zero count the other histogram would have if it
// were modified is returned.
func (h *FloatHistogram) reconcileZeroBuckets(other *FloatHistogram) float64 {
        zeroCount, threshold := other.ZeroCount, other.ZeroThreshold

        // Widening one side may push the threshold beyond the other side's
        // threshold, so keep going until both agree.
        for threshold != h.ZeroThreshold {
                if threshold < h.ZeroThreshold {
                        zeroCount, threshold = other.zeroCountForLargerThreshold(h.ZeroThreshold)
                }
                if threshold > h.ZeroThreshold {
                        h.ZeroCount, h.ZeroThreshold = h.zeroCountForLargerThreshold(threshold)
                        h.trimBucketsInZeroBucket()
                }
        }
        return zeroCount
}

// floatBucketIterator is a low-level constructor for bucket iterators.
//
// The returned iterator walks the positive buckets if positive is true and the
// negative buckets otherwise. Custom values are only handed to iterators over
// the positive buckets.
//
// Only for exponential schemas: if absoluteStartValue is < the lowest absolute
// value of any upper bucket boundary, the iterator starts with the first
// bucket. Otherwise, all buckets with an absolute value of their upper
// boundary ≤ absoluteStartValue are skipped. For custom bucket schemas,
// absoluteStartValue is ignored and no buckets are skipped.
//
// targetSchema must be ≤ the schema of the FloatHistogram (and of course
// within the legal values for schemas in general). The buckets are merged to
// match the targetSchema prior to iterating (without mutating the
// FloatHistogram). Custom buckets schemas cannot be merged with other schemas.
// The method panics if targetSchema violates any of these constraints.
func (h *FloatHistogram) floatBucketIterator(
        positive bool, absoluteStartValue float64, targetSchema int32,
) floatBucketIterator {
        switch {
        case h.UsesCustomBuckets() && targetSchema != h.Schema:
                panic(fmt.Errorf("cannot merge from custom buckets schema to exponential schema"))
        case !h.UsesCustomBuckets() && IsCustomBucketsSchema(targetSchema):
                panic(fmt.Errorf("cannot merge from exponential buckets schema to custom schema"))
        case targetSchema > h.Schema:
                panic(fmt.Errorf("cannot merge from schema %d to %d", h.Schema, targetSchema))
        }

        // Pick the side to iterate over.
        spans, buckets := h.NegativeSpans, h.NegativeBuckets
        var customValues []float64
        if positive {
                spans, buckets, customValues = h.PositiveSpans, h.PositiveBuckets, h.CustomValues
        }

        return floatBucketIterator{
                baseBucketIterator: baseBucketIterator[float64, float64]{
                        schema:       h.Schema,
                        spans:        spans,
                        buckets:      buckets,
                        positive:     positive,
                        customValues: customValues,
                },
                targetSchema:       targetSchema,
                absoluteStartValue: absoluteStartValue,
                // With a start value of 0, there is nothing to skip.
                boundReachedStartValue: absoluteStartValue == 0,
        }
}

// newReverseFloatBucketIterator is a low-level constructor for reverse bucket
// iterators. The returned iterator is positioned at the end of the provided
// spans and buckets: its span and bucket indices refer to the last span and
// the last bucket, respectively, and its current bucket index is set to the
// sum of the offsets and lengths of all spans.
func newReverseFloatBucketIterator(
        spans []Span, buckets []float64, schema int32, positive bool, customValues []float64,
) reverseFloatBucketIterator {
        // Accumulate offsets and lengths over all spans to find where the
        // iteration has to begin.
        var endIdx int32
        for idx := range spans {
                endIdx += spans[idx].Offset + int32(spans[idx].Length)
        }

        r := reverseFloatBucketIterator{
                baseBucketIterator: baseBucketIterator[float64, float64]{
                        schema:       schema,
                        spans:        spans,
                        buckets:      buckets,
                        positive:     positive,
                        customValues: customValues,
                },
        }
        r.currIdx = endIdx
        r.bucketsIdx = len(buckets) - 1
        r.spansIdx = len(spans) - 1
        if r.spansIdx >= 0 {
                // Start at the last bucket of the last span.
                r.idxInSpan = int32(spans[r.spansIdx].Length) - 1
        }

        return r
}

// floatBucketIterator iterates over the buckets described by the spans and
// buckets of the embedded baseBucketIterator, in the order given by the spans.
// If targetSchema differs from the embedded schema, Next sums up all source
// buckets that map to the same bucket index in targetSchema and returns them
// as one bucket.
type floatBucketIterator struct {
        baseBucketIterator[float64, float64]

        targetSchema       int32   // targetSchema is the schema to merge to and must be ≤ schema.
        origIdx            int32   // The bucket index within the original schema.
        absoluteStartValue float64 // Never return buckets with an upper bound ≤ this value.

        boundReachedStartValue bool // Has getBound reached absoluteStartValue already?
}

// At returns the bucket the iterator currently points to, with its bounds
// calculated for targetSchema.
func (i *floatBucketIterator) At() Bucket[float64] {
        // Need to use i.targetSchema rather than i.baseBucketIterator.schema.
        return i.baseBucketIterator.at(i.targetSchema)
}

// Next advances the iterator to the next bucket and reports whether one was
// found. If schema and targetSchema are equal, the source buckets are returned
// one by one. Otherwise, all consecutive source buckets that map to the same
// index in targetSchema (see targetIdx) are summed up into a single bucket.
// For exponential target schemas, leading buckets for which
// getBoundExponential returns a value ≤ absoluteStartValue are skipped.
func (i *floatBucketIterator) Next() bool {
        if i.spansIdx >= len(i.spans) {
                return false
        }

        if i.schema == i.targetSchema {
                // Fast path for the common case.
                span := i.spans[i.spansIdx]
                if i.bucketsIdx == 0 {
                        // Seed currIdx for the first bucket.
                        i.currIdx = span.Offset
                } else {
                        i.currIdx++
                }

                for i.idxInSpan >= span.Length {
                        // We have exhausted the current span and have to find a new
                        // one. We even handle pathologic spans of length 0 here.
                        i.idxInSpan = 0
                        i.spansIdx++
                        if i.spansIdx >= len(i.spans) {
                                return false
                        }
                        span = i.spans[i.spansIdx]
                        i.currIdx += span.Offset
                }

                i.currCount = i.buckets[i.bucketsIdx]
                i.idxInSpan++
                i.bucketsIdx++
        } else {
                // Copy all of these into local variables so that we can forward to the
                // next bucket and then roll back if needed.
                origIdx, spansIdx, idxInSpan := i.origIdx, i.spansIdx, i.idxInSpan
                span := i.spans[spansIdx]
                firstPass := true
                i.currCount = 0

        mergeLoop: // Merge together all buckets from the original schema that fall into one bucket in the targetSchema.
                for {
                        if i.bucketsIdx == 0 {
                                // Seed origIdx for the first bucket.
                                origIdx = span.Offset
                        } else {
                                origIdx++
                        }
                        for idxInSpan >= span.Length {
                                // We have exhausted the current span and have to find a new
                                // one. We even handle pathologic spans of length 0 here.
                                idxInSpan = 0
                                spansIdx++
                                if spansIdx >= len(i.spans) {
                                        if firstPass {
                                                return false
                                        }
                                        break mergeLoop
                                }
                                span = i.spans[spansIdx]
                                origIdx += span.Offset
                        }
                        currIdx := targetIdx(origIdx, i.schema, i.targetSchema)
                        switch {
                        case firstPass:
                                i.currIdx = currIdx
                                firstPass = false
                        case currIdx != i.currIdx:
                                // Reached next bucket in targetSchema.
                                // Do not actually forward to the next bucket, but break out.
                                break mergeLoop
                        }
                        // The source bucket belongs to the current target bucket:
                        // consume it and commit the forwarded position. The loop
                        // always has to look at the following source bucket, too,
                        // because this branch is only taken if the schemas differ.
                        i.currCount += i.buckets[i.bucketsIdx]
                        idxInSpan++
                        i.bucketsIdx++
                        i.origIdx, i.spansIdx, i.idxInSpan = origIdx, spansIdx, idxInSpan
                }
        }

        // Skip buckets before absoluteStartValue for exponential schemas.
        // TODO(beorn7): Maybe do something more efficient than this recursive call.
        if !i.boundReachedStartValue && IsExponentialSchema(i.targetSchema) && getBoundExponential(i.currIdx, i.targetSchema) <= i.absoluteStartValue {
                return i.Next()
        }
        i.boundReachedStartValue = true
        return true
}

// reverseFloatBucketIterator iterates over the buckets described by the
// embedded baseBucketIterator from the last bucket to the first one.
//
// Its idxInSpan field shadows the unsigned field of the same name in
// baseBucketIterator.
type reverseFloatBucketIterator struct {
        baseBucketIterator[float64, float64]
        idxInSpan int32 // Changed from uint32 to allow negative values for exhaustion detection.
}

// Next moves the iterator one bucket towards the start of the buckets and
// reports whether there was a bucket left to move to.
func (i *reverseFloatBucketIterator) Next() bool {
        i.currIdx--
        if i.bucketsIdx < 0 {
                return false
        }

        // A negative idxInSpan signals that the current span is used up. Walk
        // back to the preceding span, skipping the gap in front of the span
        // just left. Repeating this also copes with pathologic spans of
        // length 0.
        for i.idxInSpan < 0 {
                i.spansIdx--
                entered, left := i.spans[i.spansIdx], i.spans[i.spansIdx+1]
                i.idxInSpan = int32(entered.Length) - 1
                i.currIdx -= left.Offset
        }

        i.currCount = i.buckets[i.bucketsIdx]
        i.bucketsIdx--
        i.idxInSpan--
        return true
}

// allFloatBucketIterator iterates over all buckets of the FloatHistogram h:
// first the buckets provided by leftIter (state -1), then the zero bucket if
// its count is greater than zero (state 0), and finally the buckets provided
// by rightIter (state 1).
type allFloatBucketIterator struct {
        h         *FloatHistogram
        leftIter  reverseFloatBucketIterator
        rightIter floatBucketIterator
        // -1 means we are iterating negative buckets.
        // 0 means it is time for the zero bucket.
        // 1 means we are iterating positive buckets.
        // Anything else means iteration is over.
        state      int8
        currBucket Bucket[float64]
}

// Next advances to the next bucket and reports whether there is one. Buckets
// are served from leftIter first, then the zero bucket (only if its count is
// greater than zero), then from rightIter. Bounds of regular buckets that
// reach into the zero bucket are clamped to the zero threshold.
func (i *allFloatBucketIterator) Next() bool {
        for {
                switch i.state {
                case -1:
                        if !i.leftIter.Next() {
                                // Negative side is exhausted, try the zero bucket next.
                                i.state = 0
                                continue
                        }
                        bucket := i.leftIter.At()
                        if bucket.Upper < 0 && bucket.Upper > -i.h.ZeroThreshold {
                                bucket.Upper = -i.h.ZeroThreshold
                        } else if bucket.Lower > 0 && bucket.Lower < i.h.ZeroThreshold {
                                bucket.Lower = i.h.ZeroThreshold
                        }
                        i.currBucket = bucket
                        return true
                case 0:
                        // The zero bucket is visited at most once.
                        i.state = 1
                        if i.h.ZeroCount > 0 {
                                i.currBucket = i.h.ZeroBucket()
                                return true
                        }
                case 1:
                        if !i.rightIter.Next() {
                                // Any state other than -1, 0, and 1 marks the end.
                                i.state = 42
                                return false
                        }
                        bucket := i.rightIter.At()
                        if bucket.Lower > 0 && bucket.Lower < i.h.ZeroThreshold {
                                bucket.Lower = i.h.ZeroThreshold
                        } else if bucket.Upper < 0 && bucket.Upper > -i.h.ZeroThreshold {
                                bucket.Upper = -i.h.ZeroThreshold
                        }
                        i.currBucket = bucket
                        return true
                default:
                        return false
                }
        }
}

// At returns the bucket stored by the most recent call to Next that returned
// true.
func (i *allFloatBucketIterator) At() Bucket[float64] {
        return i.currBucket
}

// targetIdx maps the bucket index idx of the original schema to the index of
// the bucket in the target schema. 2^(originSchema-targetSchema) consecutive
// original indices end up in the same target index. originSchema must not be
// smaller than targetSchema, as that would result in a negative shift count.
func targetIdx(idx, originSchema, targetSchema int32) int32 {
        schemaDiff := originSchema - targetSchema
        shifted := (idx - 1) >> schemaDiff
        return shifted + 1
}

// addBuckets adds the buckets described by spansB/bucketsB to the buckets described by spansA/bucketsA,
// creating missing buckets in spansA/bucketsA as needed.
// It returns the resulting spans/buckets (which must be used instead of the original spansA/bucketsA,
// although spansA/bucketsA might get modified by this function).
// All buckets must use the same provided schema.
// Buckets in spansB/bucketsB with an absolute upper limit ≤ threshold are ignored.
// If negative is true, the buckets in spansB/bucketsB are subtracted rather than added.
//
// The threshold is only applied if schema is an exponential schema, and only
// to the leading buckets of B: once a bucket above the threshold has been
// seen, no further buckets are checked against it.
func addBuckets(
        schema int32, threshold float64, negative bool,
        spansA []Span, bucketsA []float64,
        spansB []Span, bucketsB []float64,
) ([]Span, []float64) {
        // iSpan, iBucket, and iInSpan form the cursor into A (span index, index
        // into bucketsA, position within the span). iSpan stays at -1 until a
        // bucket of B has been located behind the first bucket of A. indexA
        // and indexB are absolute bucket indices, bIdxB is the index into
        // bucketsB.
        var (
                iSpan              = -1
                iBucket            = -1
                iInSpan            int32
                indexA             int32
                indexB             int32
                bIdxB              int
                bucketB            float64
                deltaIndex         int32
                lowerThanThreshold = true
        )

        for _, spanB := range spansB {
                indexB += spanB.Offset
                for j := 0; j < int(spanB.Length); j++ {
                        if lowerThanThreshold && IsExponentialSchema(schema) && getBoundExponential(indexB, schema) <= threshold {
                                goto nextLoop
                        }
                        lowerThanThreshold = false

                        bucketB = bucketsB[bIdxB]
                        if negative {
                                bucketB *= -1
                        }

                        if iSpan == -1 {
                                if len(spansA) == 0 || spansA[0].Offset > indexB {
                                        // Add bucket before all others.
                                        bucketsA = append(bucketsA, 0)
                                        copy(bucketsA[1:], bucketsA)
                                        bucketsA[0] = bucketB
                                        if len(spansA) > 0 && spansA[0].Offset == indexB+1 {
                                                spansA[0].Length++
                                                spansA[0].Offset--
                                                goto nextLoop
                                        }
                                        spansA = append(spansA, Span{})
                                        copy(spansA[1:], spansA)
                                        spansA[0] = Span{Offset: indexB, Length: 1}
                                        if len(spansA) > 1 {
                                                // Convert the absolute offset in the formerly
                                                // first span to a relative offset.
                                                spansA[1].Offset -= indexB + 1
                                        }
                                        goto nextLoop
                                } else if spansA[0].Offset == indexB {
                                        // Just add to first bucket.
                                        bucketsA[0] += bucketB
                                        goto nextLoop
                                }
                                iSpan, iBucket, iInSpan = 0, 0, 0
                                indexA = spansA[0].Offset
                        }
                        // Walk the cursor in A forward by the distance between the
                        // current bucket of B and the position reached in A so far.
                        deltaIndex = indexB - indexA
                        for {
                                remainingInSpan := int32(spansA[iSpan].Length) - iInSpan
                                if deltaIndex < remainingInSpan {
                                        // Bucket is in current span.
                                        iBucket += int(deltaIndex)
                                        iInSpan += deltaIndex
                                        bucketsA[iBucket] += bucketB
                                        break
                                }
                                deltaIndex -= remainingInSpan
                                iBucket += int(remainingInSpan)
                                iSpan++
                                if iSpan == len(spansA) || deltaIndex < spansA[iSpan].Offset {
                                        // Bucket is in gap behind previous span (or there are no further spans).
                                        bucketsA = append(bucketsA, 0)
                                        copy(bucketsA[iBucket+1:], bucketsA[iBucket:])
                                        bucketsA[iBucket] = bucketB
                                        switch {
                                        case deltaIndex == 0:
                                                // Directly after previous span, extend previous span.
                                                if iSpan < len(spansA) {
                                                        spansA[iSpan].Offset--
                                                }
                                                iSpan--
                                                iInSpan = int32(spansA[iSpan].Length)
                                                spansA[iSpan].Length++
                                                goto nextLoop
                                        case iSpan < len(spansA) && deltaIndex == spansA[iSpan].Offset-1:
                                                // Directly before next span, extend next span.
                                                iInSpan = 0
                                                spansA[iSpan].Offset--
                                                spansA[iSpan].Length++
                                                goto nextLoop
                                        default:
                                                // No next span, or next span is not directly adjacent to new bucket.
                                                // Add new span.
                                                iInSpan = 0
                                                if iSpan < len(spansA) {
                                                        spansA[iSpan].Offset -= deltaIndex + 1
                                                }
                                                spansA = append(spansA, Span{})
                                                copy(spansA[iSpan+1:], spansA[iSpan:])
                                                spansA[iSpan] = Span{Length: 1, Offset: deltaIndex}
                                                goto nextLoop
                                        }
                                } else {
                                        // Try start of next span.
                                        deltaIndex -= spansA[iSpan].Offset
                                        iInSpan = 0
                                }
                        }

                nextLoop:
                        // Remember the index just handled as the position reached
                        // in A and advance to the next bucket of B.
                        indexA = indexB
                        indexB++
                        bIdxB++
                }
        }

        return spansA, bucketsA
}

// FloatBucketsMatch reports whether b1 and b2 have the same length and hold
// bit-for-bit identical values at every position. As the bit patterns are
// compared rather than the numerical values, a NaN matches a NaN with the
// same bit pattern, while 0 and -0 do not match.
func FloatBucketsMatch(b1, b2 []float64) bool {
        if len(b2) != len(b1) {
                return false
        }
        for idx := range b1 {
                if math.Float64bits(b1[idx]) != math.Float64bits(b2[idx]) {
                        return false
                }
        }
        return true
}

// ReduceResolution converts the positive and negative spans and buckets of
// the float histogram to targetSchema and sets the histogram's schema
// accordingly. The histogram is modified in place and returned.
//
// The method panics if the histogram uses custom buckets, if targetSchema is
// the custom buckets schema, or if targetSchema is not smaller than the
// histogram's current schema.
func (h *FloatHistogram) ReduceResolution(targetSchema int32) *FloatHistogram {
        switch {
        case h.UsesCustomBuckets():
                panic("cannot reduce resolution when there are custom buckets")
        case IsCustomBucketsSchema(targetSchema):
                panic("cannot reduce resolution to custom buckets schema")
        case targetSchema >= h.Schema:
                panic(fmt.Errorf("cannot reduce resolution from schema %d to %d", h.Schema, targetSchema))
        }

        origSchema := h.Schema
        h.PositiveSpans, h.PositiveBuckets = reduceResolution(h.PositiveSpans, h.PositiveBuckets, origSchema, targetSchema, false, true)
        h.NegativeSpans, h.NegativeBuckets = reduceResolution(h.NegativeSpans, h.NegativeBuckets, origSchema, targetSchema, false, true)
        h.Schema = targetSchema

        return h
}

// Copyright 2022 The Prometheus Authors
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package histogram

import (
        "errors"
        "fmt"
        "math"
        "strings"
)

const (
        // ExponentialSchemaMax and ExponentialSchemaMin are the inclusive limits
        // of the range of schema values for which IsExponentialSchema returns
        // true.
        ExponentialSchemaMax int32 = 8
        ExponentialSchemaMin int32 = -4
        // CustomBucketsSchema is the schema value for which
        // IsCustomBucketsSchema returns true.
        CustomBucketsSchema  int32 = -53
)

// Sentinel errors describing invalid histograms and incompatible histogram
// operations. Where they are returned wrapped with additional context (as
// done by the check functions in this file), use errors.Is to test for them.
var (
        ErrHistogramCountNotBigEnough     = errors.New("histogram's observation count should be at least the number of observations found in the buckets")
        ErrHistogramCountMismatch         = errors.New("histogram's observation count should equal the number of observations found in the buckets (in absence of NaN)")
        ErrHistogramNegativeBucketCount   = errors.New("histogram has a bucket whose observation count is negative")
        ErrHistogramSpanNegativeOffset    = errors.New("histogram has a span whose offset is negative")
        ErrHistogramSpansBucketsMismatch  = errors.New("histogram spans specify different number of buckets than provided")
        ErrHistogramCustomBucketsMismatch = errors.New("histogram custom bounds are too few")
        ErrHistogramCustomBucketsInvalid  = errors.New("histogram custom bounds must be in strictly increasing order")
        ErrHistogramCustomBucketsInfinite = errors.New("histogram custom bounds must be finite")
        ErrHistogramsIncompatibleSchema   = errors.New("cannot apply this operation on histograms with a mix of exponential and custom bucket schemas")
        ErrHistogramsIncompatibleBounds   = errors.New("cannot apply this operation on custom buckets histograms with different custom bounds")
)

// IsCustomBucketsSchema reports whether s is the schema value used for
// histograms with custom buckets (CustomBucketsSchema).
func IsCustomBucketsSchema(s int32) bool {
        return s == CustomBucketsSchema
}

// IsExponentialSchema reports whether s lies within the range of exponential
// schemas, i.e. between ExponentialSchemaMin and ExponentialSchemaMax
// (both inclusive).
func IsExponentialSchema(s int32) bool {
        if s < ExponentialSchemaMin {
                return false
        }
        return s <= ExponentialSchemaMax
}

// BucketCount is a type constraint for the count in a bucket, which can be
// float64 (for type FloatHistogram) or uint64 (for type Histogram). It is the
// count type exposed via Bucket and BucketIterator.
type BucketCount interface {
        float64 | uint64
}

// InternalBucketCount is used internally by Histogram and FloatHistogram. The
// difference to the BucketCount above is that Histogram internally uses deltas
// between buckets rather than absolute counts (while FloatHistogram uses
// absolute counts directly). Go type parameters don't allow type
// specialization. Therefore, where special treatment of deltas between buckets
// vs. absolute counts is important, this information has to be provided as a
// separate boolean parameter "deltaBuckets".
//
// Note that the integer variant is signed (int64 rather than uint64).
type InternalBucketCount interface {
        float64 | int64
}

// Bucket represents a bucket with lower and upper limit and the absolute count
// of samples in the bucket. It also specifies if each limit is inclusive or
// not. (Mathematically, inclusive limits create a closed interval, and
// non-inclusive limits an open interval.)
//
// To represent cumulative buckets, Lower is set to -Inf, and the Count is then
// cumulative (including the counts of all buckets for smaller values).
type Bucket[BC BucketCount] struct {
        // Lower and Upper are the limits of the bucket.
        Lower, Upper                   float64
        // LowerInclusive and UpperInclusive tell whether the respective limit
        // itself belongs to the bucket.
        LowerInclusive, UpperInclusive bool
        // Count is the absolute count of samples in the bucket.
        Count                          BC

        // Index within schema. To easily compare buckets that share the same
        // schema and sign (positive or negative). Irrelevant for the zero bucket.
        Index int32
}

// strippedBucket is Bucket without bound values (which are expensive to calculate
// and not used in certain use cases).
type strippedBucket[BC BucketCount] struct {
        count BC    // Count of samples in the bucket, cf. Bucket.Count.
        index int32 // Index of the bucket, cf. Bucket.Index.
}

// String renders the bucket as its interval followed by a colon and the
// count, e.g. "(1,2]:3". Following the usual mathematical notation, '[' and
// ']' mark inclusive bounds, '(' and ')' mark non-inclusive bounds.
func (b Bucket[BC]) String() string {
        opening, closing := '(', ')'
        if b.LowerInclusive {
                opening = '['
        }
        if b.UpperInclusive {
                closing = ']'
        }

        var sb strings.Builder
        sb.WriteRune(opening)
        fmt.Fprintf(&sb, "%g,%g", b.Lower, b.Upper)
        sb.WriteRune(closing)
        fmt.Fprintf(&sb, ":%v", b.Count)
        return sb.String()
}

// BucketIterator iterates over the buckets of a Histogram, returning decoded
// buckets.
type BucketIterator[BC BucketCount] interface {
        // Next advances the iterator by one. The boolean result tells
        // whether a bucket is available via At afterwards.
        Next() bool
        // At returns the current bucket.
        At() Bucket[BC]
}

// baseBucketIterator provides a struct that is shared by most BucketIterator
// implementations, together with an implementation of the At method. This
// iterator can be embedded in full implementations of BucketIterator to save on
// code replication.
//
// BC is the count type of the returned buckets, IBC the type of the counts as
// stored in the buckets slice.
type baseBucketIterator[BC BucketCount, IBC InternalBucketCount] struct {
        schema  int32
        spans   []Span
        buckets []IBC

        positive bool // Whether this is for positive buckets.

        spansIdx   int    // Current span within spans slice.
        idxInSpan  uint32 // Index in the current span. 0 <= idxInSpan < span.Length.
        bucketsIdx int    // Current bucket within buckets slice.

        currCount IBC   // Count in the current bucket.
        currIdx   int32 // The actual bucket index.

        customValues []float64 // Bounds (usually upper) for histograms with custom buckets.
}

// At returns the current bucket, with its bounds calculated for the
// iterator's own schema.
func (b *baseBucketIterator[BC, IBC]) At() Bucket[BC] {
        return b.at(b.schema)
}

// at is the internal variant of the exported At. It builds the current bucket
// but calculates the bounds for the provided schema instead of the iterator's
// own schema.
func (b *baseBucketIterator[BC, IBC]) at(schema int32) Bucket[BC] {
        // Bounds for the current and for the preceding index. For negative
        // buckets, they are negated and swap their roles below.
        far := getBound(b.currIdx, schema, b.customValues)
        near := getBound(b.currIdx-1, schema, b.customValues)

        result := Bucket[BC]{
                Count: BC(b.currCount),
                Index: b.currIdx,
        }
        if b.positive {
                result.Lower, result.Upper = near, far
        } else {
                result.Lower, result.Upper = -far, -near
        }

        if IsCustomBucketsSchema(schema) {
                // With custom buckets, the upper bound is always inclusive, the
                // lower bound only for the bucket with index 0.
                result.LowerInclusive, result.UpperInclusive = b.currIdx == 0, true
        } else {
                // Otherwise, a bound is inclusive if it is the one farther away
                // from zero.
                result.LowerInclusive, result.UpperInclusive = result.Lower < 0, result.Upper > 0
        }
        return result
}

// strippedAt returns the current bucket as a strippedBucket, i.e. just count
// and index. This avoids calculating the bucket bounds and is therefore
// cheaper than At.
func (b *baseBucketIterator[BC, IBC]) strippedAt() strippedBucket[BC] {
        var stripped strippedBucket[BC]
        stripped.count = BC(b.currCount)
        stripped.index = b.currIdx
        return stripped
}

// compactBuckets is a generic function used by both Histogram.Compact and
// FloatHistogram.Compact. Set deltaBuckets to true if the provided buckets are
// deltas. Set it to false if the buckets contain absolute counts.
//
// Empty buckets at the start or the end of a span are always removed. Runs
// of empty buckets in the middle of a span are only removed if they are
// longer than maxEmptyBuckets. Neighboring spans separated by an offset of at
// most maxEmptyBuckets are merged by inserting empty buckets.
//
// The returned buckets and spans must be used instead of the provided ones.
// The provided slices might get modified by this function.
func compactBuckets[IBC InternalBucketCount](buckets []IBC, spans []Span, maxEmptyBuckets int, deltaBuckets bool) ([]IBC, []Span) {
        // Fast path: If there are no empty buckets AND no offset in any span is
        // <= maxEmptyBuckets AND no span has length 0, there is nothing to do and we can return
        // immediately. We check that first because it's cheap and presumably
        // common.
        nothingToDo := true
        var currentBucketAbsolute IBC
        for _, bucket := range buckets {
                if deltaBuckets {
                        currentBucketAbsolute += bucket
                } else {
                        currentBucketAbsolute = bucket
                }
                if currentBucketAbsolute == 0 {
                        nothingToDo = false
                        break
                }
        }
        if nothingToDo {
                for _, span := range spans {
                        if int(span.Offset) <= maxEmptyBuckets || span.Length == 0 {
                                nothingToDo = false
                                break
                        }
                }
                if nothingToDo {
                        return buckets, spans
                }
        }

        var iBucket, iSpan int
        var posInSpan uint32
        currentBucketAbsolute = 0

        // Helper function. It returns the number of consecutive empty buckets
        // starting at the current position (iBucket/posInSpan) without
        // leaving the current span.
        emptyBucketsHere := func() int {
                i := 0
                abs := currentBucketAbsolute
                for uint32(i)+posInSpan < spans[iSpan].Length && abs == 0 {
                        i++
                        if i+iBucket >= len(buckets) {
                                break
                        }
                        abs = buckets[i+iBucket]
                }
                return i
        }

        // Merge spans with zero-offset to avoid special cases later.
        if len(spans) > 1 {
                for i, span := range spans[1:] {
                        if span.Offset == 0 {
                                spans[iSpan].Length += span.Length
                                continue
                        }
                        iSpan++
                        if i+1 != iSpan {
                                spans[iSpan] = span
                        }
                }
                spans = spans[:iSpan+1]
                iSpan = 0
        }

        // Merge spans with zero-length to avoid special cases later.
        for i, span := range spans {
                if span.Length == 0 {
                        if i+1 < len(spans) {
                                spans[i+1].Offset += span.Offset
                        }
                        continue
                }
                if i != iSpan {
                        spans[iSpan] = span
                }
                iSpan++
        }
        spans = spans[:iSpan]
        iSpan = 0

        // Cut out empty buckets from start and end of spans, no matter
        // what. Also cut out empty buckets from the middle of a span but only
        // if there are more than maxEmptyBuckets consecutive empty buckets.
        for iBucket < len(buckets) {
                if deltaBuckets {
                        currentBucketAbsolute += buckets[iBucket]
                } else {
                        currentBucketAbsolute = buckets[iBucket]
                }
                if nEmpty := emptyBucketsHere(); nEmpty > 0 {
                        if posInSpan > 0 &&
                                nEmpty < int(spans[iSpan].Length-posInSpan) &&
                                nEmpty <= maxEmptyBuckets {
                                // The empty buckets are in the middle of a
                                // span, and there are few enough to not bother.
                                // Just fast-forward.
                                iBucket += nEmpty
                                if deltaBuckets {
                                        currentBucketAbsolute = 0
                                }
                                posInSpan += uint32(nEmpty)
                                continue
                        }
                        // In all other cases, we cut out the empty buckets.
                        if deltaBuckets && iBucket+nEmpty < len(buckets) {
                                // Fold the delta of the first removed bucket into
                                // the first bucket that is kept.
                                currentBucketAbsolute = -buckets[iBucket]
                                buckets[iBucket+nEmpty] += buckets[iBucket]
                        }
                        buckets = append(buckets[:iBucket], buckets[iBucket+nEmpty:]...)
                        if posInSpan == 0 {
                                // Start of span.
                                if nEmpty == int(spans[iSpan].Length) {
                                        // The whole span is empty.
                                        offset := spans[iSpan].Offset
                                        spans = append(spans[:iSpan], spans[iSpan+1:]...)
                                        if len(spans) > iSpan {
                                                spans[iSpan].Offset += offset + int32(nEmpty)
                                        }
                                        continue
                                }
                                spans[iSpan].Length -= uint32(nEmpty)
                                spans[iSpan].Offset += int32(nEmpty)
                                continue
                        }
                        // It's in the middle or in the end of the span.
                        // Split the current span.
                        newSpan := Span{
                                Offset: int32(nEmpty),
                                Length: spans[iSpan].Length - posInSpan - uint32(nEmpty),
                        }
                        spans[iSpan].Length = posInSpan
                        // In any case, we have to split to the next span.
                        iSpan++
                        posInSpan = 0
                        if newSpan.Length == 0 {
                                // The span is empty, so we were already at the end of a span.
                                // We don't have to insert the new span, just adjust the next
                                // span's offset, if there is one.
                                if iSpan < len(spans) {
                                        spans[iSpan].Offset += int32(nEmpty)
                                }
                                continue
                        }
                        // Insert the new span.
                        spans = append(spans, Span{})
                        if iSpan+1 < len(spans) {
                                copy(spans[iSpan+1:], spans[iSpan:])
                        }
                        spans[iSpan] = newSpan
                        continue
                }
                iBucket++
                posInSpan++
                if posInSpan >= spans[iSpan].Length {
                        posInSpan = 0
                        iSpan++
                }
        }
        if maxEmptyBuckets == 0 || len(buckets) == 0 {
                return buckets, spans
        }

        // Finally, check if any offsets between spans are small enough to merge
        // the spans.
        iBucket = int(spans[0].Length)
        if deltaBuckets {
                currentBucketAbsolute = 0
                for _, bucket := range buckets[:iBucket] {
                        currentBucketAbsolute += bucket
                }
        }
        iSpan = 1
        for iSpan < len(spans) {
                if int(spans[iSpan].Offset) > maxEmptyBuckets {
                        l := int(spans[iSpan].Length)
                        if deltaBuckets {
                                for _, bucket := range buckets[iBucket : iBucket+l] {
                                        currentBucketAbsolute += bucket
                                }
                        }
                        iBucket += l
                        iSpan++
                        continue
                }
                // Merge span with previous one and insert empty buckets.
                offset := int(spans[iSpan].Offset)
                spans[iSpan-1].Length += uint32(offset) + spans[iSpan].Length
                spans = append(spans[:iSpan], spans[iSpan+1:]...)
                newBuckets := make([]IBC, len(buckets)+offset)
                copy(newBuckets, buckets[:iBucket])
                copy(newBuckets[iBucket+offset:], buckets[iBucket:])
                if deltaBuckets {
                        // With deltas, the first inserted bucket has to go down
                        // to zero, and the first bucket behind the inserted
                        // ones has to make up for that.
                        newBuckets[iBucket] = -currentBucketAbsolute
                        newBuckets[iBucket+offset] += currentBucketAbsolute
                }
                iBucket += offset
                buckets = newBuckets
                currentBucketAbsolute = buckets[iBucket]
                // Note that with many merges, it would be more efficient to
                // first record all the chunks of empty buckets to insert and
                // then do it in one go through all the buckets.
        }

        return buckets, spans
}

// checkHistogramSpans validates a span layout against the number of buckets
// that accompany it. Every span except the first must have a non-negative
// offset (the first span's offset is not inspected here), and the span lengths
// must add up to exactly numBuckets. A violation is reported as an error
// wrapping ErrHistogramSpanNegativeOffset or ErrHistogramSpansBucketsMismatch,
// respectively. Span numbers in error messages are 1-based.
func checkHistogramSpans(spans []Span, numBuckets int) error {
        total := 0
        for i := range spans {
                s := spans[i]
                // Only the leading span is allowed to carry a negative offset.
                if i != 0 && s.Offset < 0 {
                        return fmt.Errorf("span number %d with offset %d: %w", i+1, s.Offset, ErrHistogramSpanNegativeOffset)
                }
                total += int(s.Length)
        }
        if total == numBuckets {
                return nil
        }
        return fmt.Errorf("spans need %d buckets, have %d buckets: %w", total, numBuckets, ErrHistogramSpansBucketsMismatch)
}

// checkHistogramBuckets verifies that no bucket holds a negative observation
// count and adds every bucket's absolute count to *count.
//
// If deltas is true, each element of buckets is interpreted as the difference
// to the previous bucket's absolute count (the first one relative to zero);
// otherwise the elements are absolute counts already. On the first negative
// absolute count, an error wrapping ErrHistogramNegativeBucketCount is
// returned; *count then still includes the buckets visited before the
// offending one. Bucket numbers in error messages are 1-based.
func checkHistogramBuckets[BC BucketCount, IBC InternalBucketCount](buckets []IBC, count *BC, deltas bool) error {
        var prev IBC // Absolute count of the previously visited bucket.
        for i, raw := range buckets {
                abs := raw
                if deltas {
                        abs = prev + raw
                }
                if abs < 0 {
                        return fmt.Errorf("bucket number %d has observation count of %v: %w", i+1, abs, ErrHistogramNegativeBucketCount)
                }
                prev = abs
                *count += BC(abs)
        }
        return nil
}

// checkHistogramCustomBounds validates the custom bucket bounds of a histogram
// together with its span layout.
//
// The bounds must be strictly increasing, must not contain NaN or -Inf, and
// must not end in +Inf (the +Inf bucket is implicit). Violations of ordering
// or a NaN bound wrap ErrHistogramCustomBucketsInvalid; an explicit +Inf bound
// wraps ErrHistogramCustomBucketsInfinite.
//
// For the spans, every offset (including the first one) must be non-negative
// (ErrHistogramSpanNegativeOffset), the span lengths must add up to numBuckets
// (ErrHistogramSpansBucketsMismatch), and the total extent of all spans
// including the gaps between them must not exceed len(bounds)+1, i.e. the
// number of buckets the bounds define (ErrHistogramCustomBucketsMismatch).
func checkHistogramCustomBounds(bounds []float64, spans []Span, numBuckets int) error {
        prev := math.Inf(-1)
        for i, curr := range bounds {
                // NaN compares false against everything. Without this check, a
                // NaN would pass the ordering test below, become prev, and
                // thereby disable the ordering test for all following bounds.
                if math.IsNaN(curr) {
                        return fmt.Errorf("bound number %d is NaN: %w", i+1, ErrHistogramCustomBucketsInvalid)
                }
                if curr <= prev {
                        return fmt.Errorf("previous bound is %f and current is %f: %w", prev, curr, ErrHistogramCustomBucketsInvalid)
                }
                prev = curr
        }
        // As the bounds are strictly increasing, only the last one can be +Inf.
        if prev == math.Inf(1) {
                return fmt.Errorf("last +Inf bound must not be explicitly defined: %w", ErrHistogramCustomBucketsInfinite)
        }

        var spanBuckets int
        var totalSpanLength int
        for n, span := range spans {
                if span.Offset < 0 {
                        return fmt.Errorf("span number %d with offset %d: %w", n+1, span.Offset, ErrHistogramSpanNegativeOffset)
                }
                spanBuckets += int(span.Length)
                totalSpanLength += int(span.Length) + int(span.Offset)
        }
        if spanBuckets != numBuckets {
                return fmt.Errorf("spans need %d buckets, have %d buckets: %w", spanBuckets, numBuckets, ErrHistogramSpansBucketsMismatch)
        }
        // len(bounds) bounds define len(bounds)+1 buckets, the last one being
        // the implicit +Inf bucket.
        if (len(bounds) + 1) < totalSpanLength {
                return fmt.Errorf("only %d custom bounds defined which is insufficient to cover total span length of %d: %w", len(bounds), totalSpanLength, ErrHistogramCustomBucketsMismatch)
        }

        return nil
}

// getBound returns the bound of the bucket with the given index.
//
// For any schema that is not a custom-buckets schema, the call is forwarded to
// getBoundExponential and customValues is ignored. For a custom-buckets
// schema, the bound is looked up in customValues, with two virtual entries
// around it: index -1 yields -Inf and index len(customValues) yields +Inf.
// Any index outside [-1, len(customValues)] makes the function panic.
func getBound(idx, schema int32, customValues []float64) float64 {
        if !IsCustomBucketsSchema(schema) {
                return getBoundExponential(idx, schema)
        }

        n := int32(len(customValues))
        if idx < -1 || idx > n {
                panic(fmt.Errorf("index %d out of bounds for custom bounds of length %d", idx, n))
        }
        if idx == -1 {
                return math.Inf(-1)
        }
        if idx == n {
                return math.Inf(1)
        }
        return customValues[idx]
}

// getBoundExponential returns the upper bound of the bucket with index idx in
// the exponential schema given by schema.
//
// For schema >= 0, the bound is assembled from a fraction looked up in the
// exponentialBounds table (selected by the lowest `schema` bits of idx) and a
// binary exponent (the remaining bits of idx, plus one). For schema < 0, the
// bound is a plain power of two with the exponent idx * 2^(-schema).
//
// One bucket needs special treatment, namely the last bucket that counts
// regular numbers. The usual formula puts its upper bound at 2^1024, which
// does not fit into a float64 and would come out as +Inf: expressed as
// fraction and exponent it is either 1.0 and 1024 (schema < 0) or 0.5 and
// 1025 (schema >= 0), whereas math.MaxFloat64 has a fraction just below 1 and
// an exponent of 1024. As 2^1024 is a power of two whose exponent is itself a
// power of two, it is a bucket boundary in every schema. Observations of ±Inf
// are meant to be counted in the bucket with the next higher index (whose
// bound naturally evaluates to +Inf), so the bound of the last regular bucket
// is pinned to math.MaxFloat64 instead.
func getBoundExponential(idx, schema int32) float64 {
        if schema >= 0 {
                // The lowest `schema` bits of idx pick the position within a
                // power of two, the bits above them pick the power of two.
                mantissa := exponentialBounds[schema][idx&((1<<schema)-1)]
                exponent := (int(idx) >> schema) + 1
                if exponent == 1025 && mantissa == 0.5 {
                        // Last regular bucket, see above.
                        return math.MaxFloat64
                }
                return math.Ldexp(mantissa, exponent)
        }

        // Negative schemas: every bucket spans 2^(-schema) powers of two.
        exponent := int(idx) << -schema
        if exponent == 1024 {
                // Last regular bucket, see above.
                return math.MaxFloat64
        }
        return math.Ldexp(1, exponent)
}

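// exponentialBounds is a precalculated table of bucket bounds in the interval
// [0.5,1) for the schemas 0 to 8. The outer index is the schema. The inner
// slice for schema s has 2^s entries, with entry i being 0.5 * 2^(i/2^s).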
var exponentialBounds = [][]float64{
        // Schema "0":
        {0.5},
        // Schema 1:
        {0.5, 0.7071067811865475},
        // Schema 2:
        {0.5, 0.5946035575013605, 0.7071067811865475, 0.8408964152537144},
        // Schema 3:
        {
                0.5, 0.5452538663326288, 0.5946035575013605, 0.6484197773255048,
                0.7071067811865475, 0.7711054127039704, 0.8408964152537144, 0.9170040432046711,
        },
        // Schema 4:
        {
                0.5, 0.5221368912137069, 0.5452538663326288, 0.5693943173783458,
                0.5946035575013605, 0.620928906036742, 0.6484197773255048, 0.6771277734684463,
                0.7071067811865475, 0.7384130729697496, 0.7711054127039704, 0.805245165974627,
                0.8408964152537144, 0.8781260801866495, 0.9170040432046711, 0.9576032806985735,
        },
        // Schema 5:
        {
                0.5, 0.5109485743270583, 0.5221368912137069, 0.5335702003384117,
                0.5452538663326288, 0.5571933712979462, 0.5693943173783458, 0.5818624293887887,
                0.5946035575013605, 0.6076236799902344, 0.620928906036742, 0.6345254785958666,
                0.6484197773255048, 0.6626183215798706, 0.6771277734684463, 0.6919549409819159,
                0.7071067811865475, 0.7225904034885232, 0.7384130729697496, 0.7545822137967112,
                0.7711054127039704, 0.7879904225539431, 0.805245165974627, 0.8228777390769823,
                0.8408964152537144, 0.8593096490612387, 0.8781260801866495, 0.8973545375015533,
                0.9170040432046711, 0.9370838170551498, 0.9576032806985735, 0.9785720620876999,
        },
        // Schema 6:
        {
                0.5, 0.5054446430258502, 0.5109485743270583, 0.5165124395106142,
                0.5221368912137069, 0.5278225891802786, 0.5335702003384117, 0.5393803988785598,
                0.5452538663326288, 0.5511912916539204, 0.5571933712979462, 0.5632608093041209,
                0.5693943173783458, 0.5755946149764913, 0.5818624293887887, 0.5881984958251406,
                0.5946035575013605, 0.6010783657263515, 0.6076236799902344, 0.6142402680534349,
                0.620928906036742, 0.6276903785123455, 0.6345254785958666, 0.6414350080393891,
                0.6484197773255048, 0.6554806057623822, 0.6626183215798706, 0.6698337620266515,
                0.6771277734684463, 0.6845012114872953, 0.6919549409819159, 0.6994898362691555,
                0.7071067811865475, 0.7148066691959849, 0.7225904034885232, 0.7304588970903234,
                0.7384130729697496, 0.7464538641456323, 0.7545822137967112, 0.762799075372269,
                0.7711054127039704, 0.7795022001189185, 0.7879904225539431, 0.7965710756711334,
                0.805245165974627, 0.8140137109286738, 0.8228777390769823, 0.8318382901633681,
                0.8408964152537144, 0.8500531768592616, 0.8593096490612387, 0.8686669176368529,
                0.8781260801866495, 0.8876882462632604, 0.8973545375015533, 0.9071260877501991,
                0.9170040432046711, 0.9269895625416926, 0.9370838170551498, 0.9472879907934827,
                0.9576032806985735, 0.9680308967461471, 0.9785720620876999, 0.9892280131939752,
        },
        // Schema 7:
        {
                0.5, 0.5027149505564014, 0.5054446430258502, 0.5081891574554764,
                0.5109485743270583, 0.5137229745593818, 0.5165124395106142, 0.5193170509806894,
                0.5221368912137069, 0.5249720429003435, 0.5278225891802786, 0.5306886136446309,
                0.5335702003384117, 0.5364674337629877, 0.5393803988785598, 0.5423091811066545,
                0.5452538663326288, 0.5482145409081883, 0.5511912916539204, 0.5541842058618393,
                0.5571933712979462, 0.5602188762048033, 0.5632608093041209, 0.5663192597993595,
                0.5693943173783458, 0.572486072215902, 0.5755946149764913, 0.5787200368168754,
                0.5818624293887887, 0.585021884841625, 0.5881984958251406, 0.5913923554921704,
                0.5946035575013605, 0.5978321960199137, 0.6010783657263515, 0.6043421618132907,
                0.6076236799902344, 0.6109230164863786, 0.6142402680534349, 0.6175755319684665,
                0.620928906036742, 0.6243004885946023, 0.6276903785123455, 0.6310986751971253,
                0.6345254785958666, 0.637970889198196, 0.6414350080393891, 0.6449179367033329,
                0.6484197773255048, 0.6519406325959679, 0.6554806057623822, 0.659039800633032,
                0.6626183215798706, 0.6662162735415805, 0.6698337620266515, 0.6734708931164728,
                0.6771277734684463, 0.6808045103191123, 0.6845012114872953, 0.688217985377265,
                0.6919549409819159, 0.6957121878859629, 0.6994898362691555, 0.7032879969095076,
                0.7071067811865475, 0.7109463010845827, 0.7148066691959849, 0.718687998724491,
                0.7225904034885232, 0.7265139979245261, 0.7304588970903234, 0.7344252166684908,
                0.7384130729697496, 0.7424225829363761, 0.7464538641456323, 0.7505070348132126,
                0.7545822137967112, 0.7586795205991071, 0.762799075372269, 0.7669409989204777,
                0.7711054127039704, 0.7752924388424999, 0.7795022001189185, 0.7837348199827764,
                0.7879904225539431, 0.7922691326262467, 0.7965710756711334, 0.8008963778413465,
                0.805245165974627, 0.8096175675974316, 0.8140137109286738, 0.8184337248834821,
                0.8228777390769823, 0.8273458838280969, 0.8318382901633681, 0.8363550898207981,
                0.8408964152537144, 0.8454623996346523, 0.8500531768592616, 0.8546688815502312,
                0.8593096490612387, 0.8639756154809185, 0.8686669176368529, 0.8733836930995842,
                0.8781260801866495, 0.8828942179666361, 0.8876882462632604, 0.8925083056594671,
                0.8973545375015533, 0.9022270839033115, 0.9071260877501991, 0.9120516927035263,
                0.9170040432046711, 0.9219832844793128, 0.9269895625416926, 0.9320230241988943,
                0.9370838170551498, 0.9421720895161669, 0.9472879907934827, 0.9524316709088368,
                0.9576032806985735, 0.9628029718180622, 0.9680308967461471, 0.9732872087896164,
                0.9785720620876999, 0.9838856116165875, 0.9892280131939752, 0.9945994234836328,
        },
        // Schema 8:
        {
                0.5, 0.5013556375251013, 0.5027149505564014, 0.5040779490592088,
                0.5054446430258502, 0.5068150424757447, 0.5081891574554764, 0.509566998038869,
                0.5109485743270583, 0.5123338964485679, 0.5137229745593818, 0.5151158188430205,
                0.5165124395106142, 0.5179128468009786, 0.5193170509806894, 0.520725062344158,
                0.5221368912137069, 0.5235525479396449, 0.5249720429003435, 0.526395386502313,
                0.5278225891802786, 0.5292536613972564, 0.5306886136446309, 0.5321274564422321,
                0.5335702003384117, 0.5350168559101208, 0.5364674337629877, 0.5379219445313954,
                0.5393803988785598, 0.5408428074966075, 0.5423091811066545, 0.5437795304588847,
                0.5452538663326288, 0.5467321995364429, 0.5482145409081883, 0.549700901315111,
                0.5511912916539204, 0.5526857228508706, 0.5541842058618393, 0.5556867516724088,
                0.5571933712979462, 0.5587040757836845, 0.5602188762048033, 0.5617377836665098,
                0.5632608093041209, 0.564787964283144, 0.5663192597993595, 0.5678547070789026,
                0.5693943173783458, 0.5709381019847808, 0.572486072215902, 0.5740382394200894,
                0.5755946149764913, 0.5771552102951081, 0.5787200368168754, 0.5802891060137493,
                0.5818624293887887, 0.5834400184762408, 0.585021884841625, 0.5866080400818185,
                0.5881984958251406, 0.5897932637314379, 0.5913923554921704, 0.5929957828304968,
                0.5946035575013605, 0.5962156912915756, 0.5978321960199137, 0.5994530835371903,
                0.6010783657263515, 0.6027080545025619, 0.6043421618132907, 0.6059806996384005,
                0.6076236799902344, 0.6092711149137041, 0.6109230164863786, 0.6125793968185725,
                0.6142402680534349, 0.6159056423670379, 0.6175755319684665, 0.6192499490999082,
                0.620928906036742, 0.622612415087629, 0.6243004885946023, 0.6259931389331581,
                0.6276903785123455, 0.6293922197748583, 0.6310986751971253, 0.6328097572894031,
                0.6345254785958666, 0.6362458516947014, 0.637970889198196, 0.6397006037528346,
                0.6414350080393891, 0.6431741147730128, 0.6449179367033329, 0.6466664866145447,
                0.6484197773255048, 0.6501778216898253, 0.6519406325959679, 0.6537082229673385,
                0.6554806057623822, 0.6572577939746774, 0.659039800633032, 0.6608266388015788,
                0.6626183215798706, 0.6644148621029772, 0.6662162735415805, 0.6680225691020727,
                0.6698337620266515, 0.6716498655934177, 0.6734708931164728, 0.6752968579460171,
                0.6771277734684463, 0.6789636531064505, 0.6808045103191123, 0.6826503586020058,
                0.6845012114872953, 0.6863570825438342, 0.688217985377265, 0.690083933630119,
                0.6919549409819159, 0.6938310211492645, 0.6957121878859629, 0.6975984549830999,
                0.6994898362691555, 0.7013863456101023, 0.7032879969095076, 0.7051948041086352,
                0.7071067811865475, 0.7090239421602076, 0.7109463010845827, 0.7128738720527471,
                0.7148066691959849, 0.7167447066838943, 0.718687998724491, 0.7206365595643126,
                0.7225904034885232, 0.7245495448210174, 0.7265139979245261, 0.7284837772007218,
                0.7304588970903234, 0.7324393720732029, 0.7344252166684908, 0.7364164454346837,
                0.7384130729697496, 0.7404151139112358, 0.7424225829363761, 0.7444354947621984,
                0.7464538641456323, 0.7484777058836176, 0.7505070348132126, 0.7525418658117031,
                0.7545822137967112, 0.7566280937263048, 0.7586795205991071, 0.7607365094544071,
                0.762799075372269, 0.7648672334736434, 0.7669409989204777, 0.7690203869158282,
                0.7711054127039704, 0.7731960915705107, 0.7752924388424999, 0.7773944698885442,
                0.7795022001189185, 0.7816156449856788, 0.7837348199827764, 0.7858597406461707,
                0.7879904225539431, 0.7901268813264122, 0.7922691326262467, 0.7944171921585818,
                0.7965710756711334, 0.7987307989543135, 0.8008963778413465, 0.8030678282083853,
                0.805245165974627, 0.8074284071024302, 0.8096175675974316, 0.8118126635086642,
                0.8140137109286738, 0.8162207259936375, 0.8184337248834821, 0.820652723822003,
                0.8228777390769823, 0.8251087869603088, 0.8273458838280969, 0.8295890460808079,
                0.8318382901633681, 0.8340936325652911, 0.8363550898207981, 0.8386226785089391,
                0.8408964152537144, 0.8431763167241966, 0.8454623996346523, 0.8477546807446661,
                0.8500531768592616, 0.8523579048290255, 0.8546688815502312, 0.8569861239649629,
                0.8593096490612387, 0.8616394738731368, 0.8639756154809185, 0.8663180910111553,
                0.8686669176368529, 0.871022112577578, 0.8733836930995842, 0.8757516765159389,
                0.8781260801866495, 0.8805069215187917, 0.8828942179666361, 0.8852879870317771,
                0.8876882462632604, 0.890095013257712, 0.8925083056594671, 0.8949281411607002,
                0.8973545375015533, 0.8997875124702672, 0.9022270839033115, 0.9046732696855155,
                0.9071260877501991, 0.909585556079304, 0.9120516927035263, 0.9145245157024483,
                0.9170040432046711, 0.9194902933879467, 0.9219832844793128, 0.9244830347552253,
                0.9269895625416926, 0.92950288621441, 0.9320230241988943, 0.9345499949706191,
                0.9370838170551498, 0.93962450902828, 0.9421720895161669, 0.9447265771954693,
                0.9472879907934827, 0.9498563490882775, 0.9524316709088368, 0.9550139751351947,
                0.9576032806985735, 0.9601996065815236, 0.9628029718180622, 0.9654133954938133,
                0.9680308967461471, 0.9706554947643201, 0.9732872087896164, 0.9759260581154889,
                0.9785720620876999, 0.9812252401044634, 0.9838856116165875, 0.9865531961276168,
                0.9892280131939752, 0.9919100824251095, 0.9945994234836328, 0.9972960560854698,
        },
}

// reduceResolution converts the spans and buckets given in the origin schema
// into spans and buckets in the target schema and returns them.
// The target schema must be smaller than the original schema.
// Set deltaBuckets to true if the provided buckets are
// deltas. Set it to false if the buckets contain absolute counts.
// The returned buckets use the same representation as the input.
// Set inplace to true to reuse input slices and avoid allocations (otherwise
// new slices will be allocated for result). With inplace set, the returned
// slices are built on top of originSpans and originBuckets, so the contents of
// the input slices are overwritten.
//
// Each origin bucket is mapped to a target bucket index via targetIdx.
// Consecutive origin buckets mapping to the same target index are summed into
// one target bucket, target buckets with adjacent indices extend the current
// target span, and a gap in target indices starts a new target span.
func reduceResolution[IBC InternalBucketCount](
        originSpans []Span,
        originBuckets []IBC,
        originSchema,
        targetSchema int32,
        deltaBuckets bool,
        inplace bool,
) ([]Span, []IBC) {
        var (
                targetSpans           []Span // The spans in the target schema.
                targetBuckets         []IBC  // The bucket counts in the target schema.
                bucketIdx             int32  // The index of bucket in the origin schema.
                bucketCountIdx        int    // The position of a bucket in origin bucket count slice `originBuckets`.
                targetBucketIdx       int32  // The index of bucket in the target schema.
                lastBucketCount       IBC    // The last visited bucket's count in the origin schema.
                lastTargetBucketIdx   int32  // The index of the last added target bucket.
                lastTargetBucketCount IBC    // The absolute count of the last added target bucket (only maintained for delta buckets after the first bucket).
        )

        if inplace {
                // Slice reuse is safe because when reducing the resolution,
                // target slices don't grow faster than origin slices are being read.
                targetSpans = originSpans[:0]
                targetBuckets = originBuckets[:0]
        }

        // Note that `span` is a copy of the slice element, so appending to
        // targetSpans (which may alias originSpans) does not affect the span
        // currently being processed.
        for _, span := range originSpans {
                // Determine the index of the first bucket in this span.
                bucketIdx += span.Offset
                for j := 0; j < int(span.Length); j++ {
                        // Determine the index of the bucket in the target schema from the index in the original schema.
                        targetBucketIdx = targetIdx(bucketIdx, originSchema, targetSchema)

                        switch {
                        case len(targetSpans) == 0:
                                // This is the first span in the targetSpans.
                                // Its offset is the absolute target index. The
                                // first bucket value is an absolute count in
                                // both representations, so it is taken over
                                // as is.
                                span := Span{
                                        Offset: targetBucketIdx,
                                        Length: 1,
                                }
                                targetSpans = append(targetSpans, span)
                                targetBuckets = append(targetBuckets, originBuckets[bucketCountIdx])
                                lastTargetBucketIdx = targetBucketIdx
                                lastBucketCount = originBuckets[bucketCountIdx]
                                lastTargetBucketCount = originBuckets[bucketCountIdx]

                        case lastTargetBucketIdx == targetBucketIdx:
                                // The current bucket has to be merged into the same target bucket as the previous bucket.
                                if deltaBuckets {
                                        // Turn the origin delta into an absolute
                                        // count, then add that count to both the
                                        // stored target value and the tracked
                                        // absolute count of the target bucket.
                                        lastBucketCount += originBuckets[bucketCountIdx]
                                        targetBuckets[len(targetBuckets)-1] += lastBucketCount
                                        lastTargetBucketCount += lastBucketCount
                                } else {
                                        targetBuckets[len(targetBuckets)-1] += originBuckets[bucketCountIdx]
                                }

                        case (lastTargetBucketIdx + 1) == targetBucketIdx:
                                // The current bucket has to go into a new target bucket,
                                // and that bucket is next to the previous target bucket,
                                // so we add it to the current target span.
                                targetSpans[len(targetSpans)-1].Length++
                                lastTargetBucketIdx++
                                if deltaBuckets {
                                        // The new target bucket is stored as the
                                        // delta to the absolute count of the
                                        // previous target bucket.
                                        lastBucketCount += originBuckets[bucketCountIdx]
                                        targetBuckets = append(targetBuckets, lastBucketCount-lastTargetBucketCount)
                                        lastTargetBucketCount = lastBucketCount
                                } else {
                                        targetBuckets = append(targetBuckets, originBuckets[bucketCountIdx])
                                }

                        case (lastTargetBucketIdx + 1) < targetBucketIdx:
                                // The current bucket has to go into a new target bucket,
                                // and that bucket is separated by a gap from the previous target bucket,
                                // so we need to add a new target span.
                                // The offset of the new span is the number of
                                // skipped target buckets.
                                span := Span{
                                        Offset: targetBucketIdx - lastTargetBucketIdx - 1,
                                        Length: 1,
                                }
                                targetSpans = append(targetSpans, span)
                                lastTargetBucketIdx = targetBucketIdx
                                if deltaBuckets {
                                        // Same delta encoding as in the adjacent
                                        // case above.
                                        lastBucketCount += originBuckets[bucketCountIdx]
                                        targetBuckets = append(targetBuckets, lastBucketCount-lastTargetBucketCount)
                                        lastTargetBucketCount = lastBucketCount
                                } else {
                                        targetBuckets = append(targetBuckets, originBuckets[bucketCountIdx])
                                }
                        }

                        bucketIdx++
                        bucketCountIdx++
                }
        }

        return targetSpans, targetBuckets
}

// clearIfNotNil truncates items to length zero while keeping its backing
// array, so that the capacity can be reused. A nil slice is returned as nil
// rather than being turned into an empty non-nil slice.
func clearIfNotNil[T any](items []T) []T {
        if items != nil {
                items = items[:0]
        }
        return items
}

// Copyright 2021 The Prometheus Authors
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package histogram

import (
        "fmt"
        "math"
        "slices"
        "strings"
)

// CounterResetHint contains the known information about a counter reset,
// or alternatively that we are dealing with a gauge histogram, where counter resets do not apply.
// It is represented as a single byte.
type CounterResetHint byte

// The possible values of CounterResetHint. They are numbered consecutively
// from zero via iota, so the zero value of the type is UnknownCounterReset.
const (
        UnknownCounterReset CounterResetHint = iota // UnknownCounterReset means we cannot say if this histogram signals a counter reset or not.
        CounterReset                                // CounterReset means there was definitely a counter reset starting from this histogram.
        NotCounterReset                             // NotCounterReset means there was definitely no counter reset with this histogram.
        GaugeType                                   // GaugeType means this is a gauge histogram, where counter resets do not happen.
)

// Histogram encodes a sparse, high-resolution histogram. See the design
// document for full details:
// https://docs.google.com/document/d/1cLNv3aufPZb3fNfaJgdaRBZsInZKKIHo9E6HinJVbpM/edit#
//
// The trickiest bit is how bucket indices represent real bucket boundaries.
// An example for schema 0 (by which each bucket is twice as wide as the
// previous bucket):
//
//        Bucket boundaries →              [-2,-1)  [-1,-0.5) [-0.5,-0.25) ... [-0.001,0.001] ... (0.25,0.5] (0.5,1]  (1,2] ....
//                                            ↑        ↑           ↑                  ↑                ↑         ↑      ↑
//        Zero bucket (width e.g. 0.001) →    |        |           |                  ZB               |         |      |
//        Positive bucket indices →           |        |           |                          ...     -1         0      1    2    3
//        Negative bucket indices →  3   2    1        0          -1       ...
//
// Which bucket indices are actually used is determined by the spans.
type Histogram struct {
        // Counter reset information.
        CounterResetHint CounterResetHint
        // Currently valid schema numbers are -4 <= n <= 8 for exponential buckets.
        // They are all for base-2 bucket schemas, where 1 is a bucket boundary in
        // each case, and then each power of two is divided into 2^n logarithmic buckets.
        // Or in other words, each bucket boundary is the previous boundary times
        // 2^(2^-n). Another valid schema number is -53 for custom buckets, defined by
        // the CustomValues field.
        Schema int32
        // Width of the zero bucket. The zero bucket covers the closed interval
        // [-ZeroThreshold, ZeroThreshold], see the ZeroBucket method.
        ZeroThreshold float64
        // Observations falling into the zero bucket.
        ZeroCount uint64
        // Total number of observations.
        Count uint64
        // Sum of observations. This is also used as the stale marker.
        Sum float64
        // Spans for positive and negative buckets (see Span below).
        PositiveSpans, NegativeSpans []Span
        // Observation counts in buckets. The first element is an absolute
        // count. All following ones are deltas relative to the previous
        // element.
        PositiveBuckets, NegativeBuckets []int64
        // Holds the custom (usually upper) bounds for bucket definitions, otherwise nil.
        // This slice is interned, to be treated as immutable and copied by reference.
        // These numbers should be strictly increasing. This field is only used when the
        // schema is for custom buckets, and the ZeroThreshold, ZeroCount, NegativeSpans
        // and NegativeBuckets fields are not used in that case.
        // NOTE(review): the Copy and CopyTo methods of Histogram duplicate the
        // contents of this slice rather than sharing it — confirm whether that
        // is intended given the interning described above.
        CustomValues []float64
}

// A Span defines a continuous sequence of buckets.
type Span struct {
        // Gap to previous span (never negative, but zero is accepted by the
        // span checks in this package), or starting index for the 1st span
        // (which can be negative, except for custom buckets, where
        // checkHistogramCustomBounds rejects a negative offset for every span).
        Offset int32
        // Length of the span, i.e. the number of consecutive buckets in it.
        Length uint32
}

// UsesCustomBuckets reports whether the histogram's Schema is a custom-buckets
// schema, as decided by IsCustomBucketsSchema.
func (h *Histogram) UsesCustomBuckets() bool {
        return IsCustomBucketsSchema(h.Schema)
}

// Copy returns a newly allocated deep copy of the Histogram.
//
// Only the fields relevant for the kind of histogram are carried over: with
// custom buckets, the zero bucket and the negative spans and buckets are left
// at their zero values, while without custom buckets, CustomValues is left
// nil. Slices of length zero are never allocated, so an empty but non-nil
// slice in h ends up as a nil slice in the copy.
func (h *Histogram) Copy() *Histogram {
        dup := &Histogram{
                CounterResetHint: h.CounterResetHint,
                Schema:           h.Schema,
                Count:            h.Count,
                Sum:              h.Sum,
        }

        // The positive side exists for every kind of histogram.
        if n := len(h.PositiveSpans); n != 0 {
                dup.PositiveSpans = make([]Span, n)
                copy(dup.PositiveSpans, h.PositiveSpans)
        }
        if n := len(h.PositiveBuckets); n != 0 {
                dup.PositiveBuckets = make([]int64, n)
                copy(dup.PositiveBuckets, h.PositiveBuckets)
        }

        if !h.UsesCustomBuckets() {
                dup.ZeroThreshold, dup.ZeroCount = h.ZeroThreshold, h.ZeroCount
                if n := len(h.NegativeSpans); n != 0 {
                        dup.NegativeSpans = make([]Span, n)
                        copy(dup.NegativeSpans, h.NegativeSpans)
                }
                if n := len(h.NegativeBuckets); n != 0 {
                        dup.NegativeBuckets = make([]int64, n)
                        copy(dup.NegativeBuckets, h.NegativeBuckets)
                }
                return dup
        }

        if n := len(h.CustomValues); n != 0 {
                dup.CustomValues = make([]float64, n)
                copy(dup.CustomValues, h.CustomValues)
        }
        return dup
}

// CopyTo makes a deep copy of h into the given Histogram object, going through
// resize and clearIfNotNil for the slices already present in the destination.
// The destination object has to be a non-nil pointer.
//
// Fields that do not apply to the kind of histogram being copied are reset in
// the destination: with custom buckets, the zero bucket is zeroed and the
// negative spans and buckets are truncated, while without custom buckets,
// CustomValues is truncated.
func (h *Histogram) CopyTo(to *Histogram) {
        to.Schema = h.Schema
        to.CounterResetHint = h.CounterResetHint
        to.Sum = h.Sum
        to.Count = h.Count

        if !h.UsesCustomBuckets() {
                // Exponential schema: zero bucket and negative side are in
                // use, custom values are not.
                to.ZeroThreshold, to.ZeroCount = h.ZeroThreshold, h.ZeroCount

                to.NegativeSpans = resize(to.NegativeSpans, len(h.NegativeSpans))
                copy(to.NegativeSpans, h.NegativeSpans)

                to.NegativeBuckets = resize(to.NegativeBuckets, len(h.NegativeBuckets))
                copy(to.NegativeBuckets, h.NegativeBuckets)

                to.CustomValues = clearIfNotNil(to.CustomValues)
        } else {
                // Custom buckets: neither a zero bucket nor a negative side.
                to.ZeroThreshold, to.ZeroCount = 0, 0

                to.NegativeSpans = clearIfNotNil(to.NegativeSpans)
                to.NegativeBuckets = clearIfNotNil(to.NegativeBuckets)

                to.CustomValues = resize(to.CustomValues, len(h.CustomValues))
                copy(to.CustomValues, h.CustomValues)
        }

        // The positive side is copied for every kind of histogram.
        to.PositiveSpans = resize(to.PositiveSpans, len(h.PositiveSpans))
        copy(to.PositiveSpans, h.PositiveSpans)

        to.PositiveBuckets = resize(to.PositiveBuckets, len(h.PositiveBuckets))
        copy(to.PositiveBuckets, h.PositiveBuckets)
}

// String returns a string representation of the Histogram: count and sum,
// followed by every bucket with a non-zero count in ascending order
// (negative buckets, then the zero bucket, then positive buckets).
func (h *Histogram) String() string {
        var sb strings.Builder
        fmt.Fprintf(&sb, "{count:%d, sum:%g", h.Count, h.Sum)

        // The negative iterator starts next to the zero bucket and goes down,
        // so the populated buckets are collected first and printed in reverse.
        var negatives []Bucket[uint64]
        negIt := h.NegativeBucketIterator()
        for negIt.Next() {
                if negIt.At().Count == 0 {
                        continue
                }
                negatives = append(negatives, negIt.At())
        }
        for i := len(negatives); i > 0; i-- {
                sb.WriteString(", ")
                sb.WriteString(negatives[i-1].String())
        }

        if h.ZeroCount != 0 {
                sb.WriteString(", ")
                sb.WriteString(h.ZeroBucket().String())
        }

        posIt := h.PositiveBucketIterator()
        for posIt.Next() {
                b := posIt.At()
                if b.Count == 0 {
                        continue
                }
                sb.WriteString(", ")
                sb.WriteString(b.String())
        }

        sb.WriteByte('}')
        return sb.String()
}

// ZeroBucket returns the zero bucket, i.e. the bucket spanning
// [-ZeroThreshold, ZeroThreshold] (both bounds inclusive) that holds ZeroCount
// observations. This method panics if the schema is for custom buckets.
func (h *Histogram) ZeroBucket() Bucket[uint64] {
        if h.UsesCustomBuckets() {
                panic("histograms with custom buckets have no zero bucket")
        }

        var zero Bucket[uint64]
        zero.Lower, zero.Upper = -h.ZeroThreshold, h.ZeroThreshold
        zero.LowerInclusive, zero.UpperInclusive = true, true
        zero.Count = h.ZeroCount
        return zero
}

// PositiveBucketIterator returns a BucketIterator over all positive buckets
// in ascending order (starting next to the zero bucket and going up). The
// histogram's custom values are handed to the iterator as well.
func (h *Histogram) PositiveBucketIterator() BucketIterator[uint64] {
        positive := newRegularBucketIterator(h.PositiveSpans, h.PositiveBuckets, h.Schema, true, h.CustomValues)
        return &positive
}

// NegativeBucketIterator returns a BucketIterator over all negative buckets
// in descending order (starting next to the zero bucket and going down). No
// custom values are passed to the iterator for the negative side.
func (h *Histogram) NegativeBucketIterator() BucketIterator[uint64] {
        negative := newRegularBucketIterator(h.NegativeSpans, h.NegativeBuckets, h.Schema, false, nil)
        return &negative
}

// CumulativeBucketIterator returns a BucketIterator to iterate over a
// cumulative view of the buckets. Only Histograms without negative buckets
// are currently supported; the method panics if the Histogram has any
// negative bucket. It is currently only used for testing.
func (h *Histogram) CumulativeBucketIterator() BucketIterator[uint64] {
        if len(h.NegativeBuckets) != 0 {
                panic("CumulativeBucketIterator called on Histogram with negative buckets")
        }

        it := new(cumulativeBucketIterator)
        it.h = h
        // -1 makes the first call to Next handle the zero bucket.
        it.posSpansIdx = -1
        return it
}

// Equals returns true if the given histogram matches exactly.
// Exact match is when there are no new buckets (even empty) and no missing buckets,
// and all the bucket values match. Spans can have different empty length spans in between,
// but they must represent the same bucket layout to match.
// Sum is compared based on its bit pattern because this method
// is about data equality rather than mathematical equality.
// Fields that are not used based on the exponential / custom buckets schema are
// ignored, except for fields where differences may cause unintended behaviour
// even if they are not supposed to be used according to the schema.
// A nil h2 never matches.
func (h *Histogram) Equals(h2 *Histogram) bool {
        switch {
        case h2 == nil:
                return false
        case h.Schema != h2.Schema, h.Count != h2.Count:
                return false
        case math.Float64bits(h.Sum) != math.Float64bits(h2.Sum):
                return false
        case h.UsesCustomBuckets() && !FloatBucketsMatch(h.CustomValues, h2.CustomValues):
                // Custom values are only compared for custom bucket schemas.
                return false
        case h.ZeroThreshold != h2.ZeroThreshold, h.ZeroCount != h2.ZeroCount:
                return false
        }

        // Negative side first, then positive side; spans are compared by layout,
        // buckets element by element.
        return spansMatch(h.NegativeSpans, h2.NegativeSpans) &&
                slices.Equal(h.NegativeBuckets, h2.NegativeBuckets) &&
                spansMatch(h.PositiveSpans, h2.PositiveSpans) &&
                slices.Equal(h.PositiveBuckets, h2.PositiveBuckets)
}

// spansMatch returns true if both spans represent the same bucket layout
// after combining zero length spans with the next non-zero length span.
// Two empty (or nil) span slices match, and zero length spans at the end of
// either slice do not affect the result.
func spansMatch(s1, s2 []Span) bool {
        if len(s1) == 0 && len(s2) == 0 {
                return true
        }

        s1idx, s2idx := 0, 0
        for {
                // Once one side is exhausted, the layouts only match if the other
                // side has nothing but zero length spans left.
                if s1idx >= len(s1) {
                        return allEmptySpans(s2[s2idx:])
                }
                if s2idx >= len(s2) {
                        return allEmptySpans(s1[s1idx:])
                }

                // currS1 and currS2 are copies, so merging offsets into them below
                // does not modify the input slices.
                currS1, currS2 := s1[s1idx], s2[s2idx]
                s1idx++
                s2idx++
                if currS1.Length == 0 {
                        // This span is zero length, so we add consecutive such spans
                        // until we find a non-zero span.
                        for ; s1idx < len(s1) && s1[s1idx].Length == 0; s1idx++ {
                                currS1.Offset += s1[s1idx].Offset
                        }
                        // Fold the accumulated offset into the first non-zero span, if
                        // there is one; otherwise currS1 stays zero length.
                        if s1idx < len(s1) {
                                currS1.Offset += s1[s1idx].Offset
                                currS1.Length = s1[s1idx].Length
                                s1idx++
                        }
                }
                if currS2.Length == 0 {
                        // This span is zero length, so we add consecutive such spans
                        // until we find a non-zero span.
                        for ; s2idx < len(s2) && s2[s2idx].Length == 0; s2idx++ {
                                currS2.Offset += s2[s2idx].Offset
                        }
                        // Same folding as above for the second slice.
                        if s2idx < len(s2) {
                                currS2.Offset += s2[s2idx].Offset
                                currS2.Length = s2[s2idx].Length
                                s2idx++
                        }
                }

                if currS1.Length == 0 && currS2.Length == 0 {
                        // The last spans of both set are zero length. Previous spans match.
                        return true
                }

                // Compare the (possibly merged) spans; any difference in offset or
                // length means the bucket layouts differ.
                if currS1.Offset != currS2.Offset || currS1.Length != currS2.Length {
                        return false
                }
        }
}

// allEmptySpans reports whether every span in s has a length of zero. It
// returns true for an empty slice.
func allEmptySpans(s []Span) bool {
        for i := range s {
                if s[i].Length != 0 {
                        return false
                }
        }
        return true
}

// Compact works like FloatHistogram.Compact. See there for detailed
// explanations. The positive side is compacted before the negative side, the
// histogram is modified in place, and the receiver is returned.
func (h *Histogram) Compact(maxEmptyBuckets int) *Histogram {
        posBuckets, posSpans := compactBuckets(h.PositiveBuckets, h.PositiveSpans, maxEmptyBuckets, true)
        h.PositiveBuckets, h.PositiveSpans = posBuckets, posSpans

        negBuckets, negSpans := compactBuckets(h.NegativeBuckets, h.NegativeSpans, maxEmptyBuckets, true)
        h.NegativeBuckets, h.NegativeSpans = negBuckets, negSpans

        return h
}

// ToFloat returns a FloatHistogram representation of the Histogram. It is a deep
// copy (e.g. spans are not shared). If fh is non-nil, its memory is reused and
// overwritten; if fh is nil, a new FloatHistogram is allocated. The integer
// bucket entries are accumulated into running totals, which become the float
// bucket values.
func (h *Histogram) ToFloat(fh *FloatHistogram) *FloatHistogram {
        if fh == nil {
                fh = new(FloatHistogram)
        }

        fh.CounterResetHint, fh.Schema = h.CounterResetHint, h.Schema
        fh.Count, fh.Sum = float64(h.Count), h.Sum

        if !h.UsesCustomBuckets() {
                // Exponential schema: carry over the zero bucket and negative side.
                fh.ZeroThreshold, fh.ZeroCount = h.ZeroThreshold, float64(h.ZeroCount)

                fh.NegativeSpans = resize(fh.NegativeSpans, len(h.NegativeSpans))
                copy(fh.NegativeSpans, h.NegativeSpans)

                fh.NegativeBuckets = resize(fh.NegativeBuckets, len(h.NegativeBuckets))
                var negTotal float64
                for i := range h.NegativeBuckets {
                        negTotal += float64(h.NegativeBuckets[i])
                        fh.NegativeBuckets[i] = negTotal
                }
                fh.CustomValues = clearIfNotNil(fh.CustomValues)
        } else {
                // Custom buckets: no zero bucket and no negative side.
                fh.ZeroThreshold, fh.ZeroCount = 0, 0
                fh.NegativeSpans = clearIfNotNil(fh.NegativeSpans)
                fh.NegativeBuckets = clearIfNotNil(fh.NegativeBuckets)

                fh.CustomValues = resize(fh.CustomValues, len(h.CustomValues))
                copy(fh.CustomValues, h.CustomValues)
        }

        // The positive side is converted for both kinds of schema.
        fh.PositiveSpans = resize(fh.PositiveSpans, len(h.PositiveSpans))
        copy(fh.PositiveSpans, h.PositiveSpans)

        fh.PositiveBuckets = resize(fh.PositiveBuckets, len(h.PositiveBuckets))
        var posTotal float64
        for i := range h.PositiveBuckets {
                posTotal += float64(h.PositiveBuckets[i])
                fh.PositiveBuckets[i] = posTotal
        }

        return fh
}

// resize returns a slice of length n. If items has enough capacity, items is
// re-sliced to length n (sharing its backing array and keeping whatever
// values it holds); otherwise a new zeroed slice of length n is allocated and
// none of the old elements are copied.
func resize[T any](items []T, n int) []T {
        if n <= cap(items) {
                return items[:n]
        }
        return make([]T, n)
}

// Validate validates consistency between span and bucket slices. Also, buckets are checked
// against negative values. It makes sure there are no unexpected fields or field values
// based on the exponential / custom buckets schema.
// For histograms that have not observed any NaN values (based on IsNaN(h.Sum) check), a
// strict h.Count = nCount + pCount + h.ZeroCount check is performed.
// Otherwise, only a lower bound check will be done (h.Count >= nCount + pCount + h.ZeroCount),
// because NaN observations do not increment the values of buckets (but they do increment
// the total h.Count).
// The first violation found is returned; nil means the histogram is valid.
func (h *Histogram) Validate() error {
        var nCount, pCount uint64

        if !h.UsesCustomBuckets() {
                if err := checkHistogramSpans(h.PositiveSpans, len(h.PositiveBuckets)); err != nil {
                        return fmt.Errorf("positive side: %w", err)
                }
                if err := checkHistogramSpans(h.NegativeSpans, len(h.NegativeBuckets)); err != nil {
                        return fmt.Errorf("negative side: %w", err)
                }
                if err := checkHistogramBuckets(h.NegativeBuckets, &nCount, true); err != nil {
                        return fmt.Errorf("negative side: %w", err)
                }
                if h.CustomValues != nil {
                        return fmt.Errorf("histogram with exponential schema must not have custom bounds")
                }
        } else {
                if err := checkHistogramCustomBounds(h.CustomValues, h.PositiveSpans, len(h.PositiveBuckets)); err != nil {
                        return fmt.Errorf("custom buckets: %w", err)
                }
                // Custom bucket histograms have neither a zero bucket nor a
                // negative side.
                switch {
                case h.ZeroCount != 0:
                        return fmt.Errorf("custom buckets: must have zero count of 0")
                case h.ZeroThreshold != 0:
                        return fmt.Errorf("custom buckets: must have zero threshold of 0")
                case len(h.NegativeSpans) > 0:
                        return fmt.Errorf("custom buckets: must not have negative spans")
                case len(h.NegativeBuckets) > 0:
                        return fmt.Errorf("custom buckets: must not have negative buckets")
                }
        }

        if err := checkHistogramBuckets(h.PositiveBuckets, &pCount, true); err != nil {
                return fmt.Errorf("positive side: %w", err)
        }

        inBuckets := nCount + pCount + h.ZeroCount
        switch sumIsNaN := math.IsNaN(h.Sum); {
        case sumIsNaN && inBuckets > h.Count:
                return fmt.Errorf("%d observations found in buckets, but the Count field is %d: %w", inBuckets, h.Count, ErrHistogramCountNotBigEnough)
        case !sumIsNaN && inBuckets != h.Count:
                return fmt.Errorf("%d observations found in buckets, but the Count field is %d: %w", inBuckets, h.Count, ErrHistogramCountMismatch)
        }

        return nil
}

// regularBucketIterator iterates over the buckets of one side of a Histogram.
// It embeds baseBucketIterator instantiated for uint64 counts and int64
// bucket entries.
type regularBucketIterator struct {
        baseBucketIterator[uint64, int64]
}

// newRegularBucketIterator returns a regularBucketIterator over the given
// spans and buckets. schema, positive and customValues are stored in the
// embedded baseBucketIterator unchanged.
func newRegularBucketIterator(spans []Span, buckets []int64, schema int32, positive bool, customValues []float64) regularBucketIterator {
        return regularBucketIterator{
                baseBucketIterator: baseBucketIterator[uint64, int64]{
                        schema:       schema,
                        spans:        spans,
                        buckets:      buckets,
                        positive:     positive,
                        customValues: customValues,
                },
        }
}

// Next advances the iterator to the next bucket and returns false once there
// is no span left to take a bucket from. On success, currIdx holds the index
// of the new bucket (gaps given by span offsets are skipped) and the next
// entry of r.buckets has been added to currCount.
func (r *regularBucketIterator) Next() bool {
        if r.spansIdx >= len(r.spans) {
                return false
        }
        span := r.spans[r.spansIdx]
        // Seed currIdx for the first bucket.
        if r.bucketsIdx == 0 {
                r.currIdx = span.Offset
        } else {
                r.currIdx++
        }
        for r.idxInSpan >= span.Length {
                // We have exhausted the current span and have to find a new
                // one. We'll even handle pathologic spans of length 0.
                r.idxInSpan = 0
                r.spansIdx++
                if r.spansIdx >= len(r.spans) {
                        return false
                }
                span = r.spans[r.spansIdx]
                // The offset of the new span is the gap to skip in the index.
                r.currIdx += span.Offset
        }

        // Accumulate the next bucket entry into the running count, then move
        // on within the span and the bucket slice.
        r.currCount += r.buckets[r.bucketsIdx]
        r.idxInSpan++
        r.bucketsIdx++
        return true
}

// cumulativeBucketIterator iterates over the zero bucket and the positive
// buckets of h, including the empty buckets between spans, and reports
// cumulative counts. It is created by Histogram.CumulativeBucketIterator.
type cumulativeBucketIterator struct {
        h *Histogram

        posSpansIdx   int    // Index in h.PositiveSpans we are in. -1 means 0 bucket.
        posBucketsIdx int    // Index in h.PositiveBuckets.
        idxInSpan     uint32 // Index in the current span. 0 <= idxInSpan < span.Length.

        initialized         bool
        currIdx             int32   // The actual bucket index after decoding from spans.
        currUpper           float64 // The upper boundary of the current bucket.
        currCount           int64   // Current non-cumulative count for the current bucket. Does not apply for empty bucket.
        currCumulativeCount uint64  // Current "cumulative" count for the current bucket.

        // Between 2 spans there could be some empty buckets which
        // still needs to be counted for cumulative buckets.
        // When we hit the end of a span, we use this to iterate
        // through the empty buckets.
        emptyBucketCount int32
}

// Next advances the iterator to the next bucket of the cumulative view and
// returns false once all positive spans are exhausted. The zero bucket comes
// first (it is skipped if its count is 0), followed by the positive buckets
// in span order. Gaps between two spans are reported as separate buckets that
// leave the cumulative count unchanged.
func (c *cumulativeBucketIterator) Next() bool {
        if c.posSpansIdx == -1 {
                // Zero bucket.
                c.posSpansIdx++
                if c.h.ZeroCount == 0 {
                        // Nothing to report for the zero bucket; continue with the
                        // first positive bucket right away.
                        return c.Next()
                }

                c.currUpper = c.h.ZeroThreshold
                c.currCount = int64(c.h.ZeroCount)
                c.currCumulativeCount = uint64(c.currCount)
                return true
        }

        if c.posSpansIdx >= len(c.h.PositiveSpans) {
                return false
        }

        if c.emptyBucketCount > 0 {
                // We are traversing through empty buckets at the moment.
                c.currUpper = getBound(c.currIdx, c.h.Schema, c.h.CustomValues)
                c.currIdx++
                c.emptyBucketCount--
                return true
        }

        span := c.h.PositiveSpans[c.posSpansIdx]
        if c.posSpansIdx == 0 && !c.initialized {
                // Initializing.
                c.currIdx = span.Offset
                // The first bucket is an absolute value and not a delta with Zero bucket.
                c.currCount = 0
                c.initialized = true
        }

        // Add the next bucket entry to the running count, then fold that count
        // into the cumulative total.
        c.currCount += c.h.PositiveBuckets[c.posBucketsIdx]
        c.currCumulativeCount += uint64(c.currCount)
        c.currUpper = getBound(c.currIdx, c.h.Schema, c.h.CustomValues)

        // currIdx is advanced past the bucket just produced.
        c.posBucketsIdx++
        c.idxInSpan++
        c.currIdx++
        if c.idxInSpan >= span.Length {
                // Move to the next span. This one is done.
                c.posSpansIdx++
                c.idxInSpan = 0
                if c.posSpansIdx < len(c.h.PositiveSpans) {
                        // The offset of the next span is the number of empty buckets
                        // to report before its first bucket.
                        c.emptyBucketCount = c.h.PositiveSpans[c.posSpansIdx].Offset
                }
        }

        return true
}

// At returns the current cumulative bucket. Its lower bound is always -Inf
// and both bounds are inclusive. The index is currIdx-1 because Next has
// already advanced currIdx past the bucket it produced.
func (c *cumulativeBucketIterator) At() Bucket[uint64] {
        var cur Bucket[uint64]
        cur.Lower, cur.Upper = math.Inf(-1), c.currUpper
        cur.LowerInclusive, cur.UpperInclusive = true, true
        cur.Count = c.currCumulativeCount
        cur.Index = c.currIdx - 1
        return cur
}

// ReduceResolution reduces the histogram's spans and buckets into the target
// schema, in place, and returns the receiver.
// The target schema must be smaller than the current histogram's schema.
// This will panic if the histogram has custom buckets, if the target schema is
// a custom buckets schema, or if the target schema is not smaller than the
// current one.
func (h *Histogram) ReduceResolution(targetSchema int32) *Histogram {
        switch {
        case h.UsesCustomBuckets():
                panic("cannot reduce resolution when there are custom buckets")
        case IsCustomBucketsSchema(targetSchema):
                panic("cannot reduce resolution to custom buckets schema")
        case targetSchema >= h.Schema:
                panic(fmt.Errorf("cannot reduce resolution from schema %d to %d", h.Schema, targetSchema))
        }

        posSpans, posBuckets := reduceResolution(h.PositiveSpans, h.PositiveBuckets, h.Schema, targetSchema, true, true)
        h.PositiveSpans, h.PositiveBuckets = posSpans, posBuckets

        negSpans, negBuckets := reduceResolution(h.NegativeSpans, h.NegativeBuckets, h.Schema, targetSchema, true, true)
        h.NegativeSpans, h.NegativeBuckets = negSpans, negBuckets

        // Only switch the schema after both sides have been converted from the
        // old one.
        h.Schema = targetSchema
        return h
}

// Copyright 2023 The Prometheus Authors
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package histogram

// GenerateBigTestHistograms generates a slice of numHistograms histograms,
// each with numBuckets/2 buckets on the positive and on the negative side.
//
// The buckets of each side are spread over numBuckets/10 spans, every span
// having an offset of 1 and a length of bucketsPerSide/numSpans. At least one
// span is used as soon as there is a bucket, so values of numBuckets below 10
// do not cause a division by zero. Note that the spans only cover all buckets
// if bucketsPerSide is a multiple of the number of spans.
//
// The i-th histogram has a zero count of i and a sum of 18.4*(i+1); its Count
// is the zero count plus the observations in all buckets.
func GenerateBigTestHistograms(numHistograms, numBuckets int) []*Histogram {
        bucketsPerSide := numBuckets / 2
        numSpans := numBuckets / 10
        if numSpans == 0 && bucketsPerSide > 0 {
                // Fewer than 10 buckets requested: put them all into one span
                // instead of dividing by zero below.
                numSpans = 1
        }
        var spanLength uint32
        if numSpans > 0 {
                spanLength = uint32(bucketsPerSide / numSpans)
        }
        // All bucket deltas are 1, so the absolute bucket counts on one side are
        // 1, 2, ..., bucketsPerSide. Their sum is bucketsPerSide*(bucketsPerSide+1)/2
        // per side, i.e. bucketsPerSide*(bucketsPerSide+1) for both sides.
        observationCount := bucketsPerSide * (1 + bucketsPerSide)

        var histograms []*Histogram
        if numHistograms > 0 {
                histograms = make([]*Histogram, 0, numHistograms)
        }
        for i := 0; i < numHistograms; i++ {
                h := &Histogram{
                        Count:           uint64(i + observationCount),
                        ZeroCount:       uint64(i),
                        ZeroThreshold:   1e-128,
                        Sum:             18.4 * float64(i+1),
                        Schema:          2,
                        NegativeSpans:   make([]Span, numSpans),
                        PositiveSpans:   make([]Span, numSpans),
                        NegativeBuckets: make([]int64, bucketsPerSide),
                        PositiveBuckets: make([]int64, bucketsPerSide),
                }

                for j := 0; j < numSpans; j++ {
                        s := Span{Offset: 1, Length: spanLength}
                        h.NegativeSpans[j] = s
                        h.PositiveSpans[j] = s
                }

                for j := 0; j < bucketsPerSide; j++ {
                        h.NegativeBuckets[j] = 1
                        h.PositiveBuckets[j] = 1
                }

                histograms = append(histograms, h)
        }
        return histograms
}

// Copyright 2017 The Prometheus Authors
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

//go:build !stringlabels && !dedupelabels

package labels

import (
        "bytes"
        "slices"
        "strings"

        "github.com/cespare/xxhash/v2"
)

// Labels is a sorted set of labels. Order has to be guaranteed upon
// instantiation.
type Labels []Label

// Len returns the number of labels in ls.
func (ls Labels) Len() int {
        return len(ls)
}

// Swap exchanges the labels at positions i and j.
func (ls Labels) Swap(i, j int) {
        ls[j], ls[i] = ls[i], ls[j]
}

// Less reports whether the name of the label at position i sorts before the
// name of the label at position j.
func (ls Labels) Less(i, j int) bool {
        return ls[i].Name < ls[j].Name
}

// Bytes returns ls as a byte slice, written into buf starting at its
// beginning. The output starts with labelSep, followed by the name and value
// of each label, all joined by seps[0].
// It uses an invalid byte character as a separator and so should not be used for printing.
func (ls Labels) Bytes(buf []byte) []byte {
        out := bytes.NewBuffer(buf[:0])
        out.WriteByte(labelSep)
        for idx := range ls {
                if idx != 0 {
                        out.WriteByte(seps[0])
                }
                out.WriteString(ls[idx].Name)
                out.WriteByte(seps[0])
                out.WriteString(ls[idx].Value)
        }
        return out.Bytes()
}

// MatchLabels returns a subset of Labels that matches/does not match with the provided label names based on the 'on' boolean.
// If on is set to true, it returns the labels whose names are listed in names.
// If on is set to false, it returns the labels whose names are not listed,
// additionally leaving out the label named MetricName.
// The result is never nil.
func (ls Labels) MatchLabels(on bool, names ...string) Labels {
        matchedLabels := Labels{}

        wanted := make(map[string]struct{}, len(names))
        for i := range names {
                wanted[names[i]] = struct{}{}
        }

        for _, l := range ls {
                _, listed := wanted[l.Name]
                if on {
                        if listed {
                                matchedLabels = append(matchedLabels, l)
                        }
                        continue
                }
                if !listed && l.Name != MetricName {
                        matchedLabels = append(matchedLabels, l)
                }
        }

        return matchedLabels
}

// Hash returns a hash value for the label set.
// Label names and values are hashed in order, each followed by a separator.
// Note: the result is not guaranteed to be consistent across different runs of Prometheus.
func (ls Labels) Hash() uint64 {
        // Use xxhash.Sum64(b) for fast path as it's faster.
        b := make([]byte, 0, 1024)
        for i, v := range ls {
                // Leave the fast path as soon as the next label would fill up b.
                if len(b)+len(v.Name)+len(v.Value)+2 >= cap(b) {
                        // If labels entry is 1KB+ do not allocate whole entry.
                        h := xxhash.New()
                        // Feed what has been buffered so far, then stream the
                        // remaining labels (from index i on) into the hasher.
                        _, _ = h.Write(b)
                        // NOTE(review): the fast path appends seps[0] while this path
                        // writes all of seps; both agree only if seps is a single
                        // byte — confirm against its definition.
                        for _, v := range ls[i:] {
                                _, _ = h.WriteString(v.Name)
                                _, _ = h.Write(seps)
                                _, _ = h.WriteString(v.Value)
                                _, _ = h.Write(seps)
                        }
                        return h.Sum64()
                }

                b = append(b, v.Name...)
                b = append(b, seps[0])
                b = append(b, v.Value...)
                b = append(b, seps[0])
        }
        return xxhash.Sum64(b)
}

// HashForLabels returns a hash value for the labels matching the provided names.
// 'names' have to be sorted in ascending order.
// b is used as scratch space: it is truncated, filled with the bytes that get
// hashed, and returned as the second result.
func (ls Labels) HashForLabels(b []byte, names ...string) (uint64, []byte) {
        b = b[:0]
        // Walk ls and names in lockstep; both are expected to be sorted by name.
        for i, j := 0, 0; i < len(ls) && j < len(names); {
                l := ls[i]
                if names[j] < l.Name {
                        j++
                        continue
                }
                if l.Name < names[j] {
                        i++
                        continue
                }
                // Same name on both sides: this label is part of the hash.
                b = append(b, l.Name...)
                b = append(b, seps[0])
                b = append(b, l.Value...)
                b = append(b, seps[0])
                i++
                j++
        }
        return xxhash.Sum64(b), b
}

// HashWithoutLabels returns a hash value for all labels except those matching
// the provided names. The label named MetricName is always left out as well.
// 'names' have to be sorted in ascending order.
// b is used as scratch space: it is truncated, filled with the bytes that get
// hashed, and returned as the second result.
func (ls Labels) HashWithoutLabels(b []byte, names ...string) (uint64, []byte) {
        b = b[:0]
        next := 0 // Position in names; everything before it sorts before the current label name.
        for _, l := range ls {
                for next < len(names) && names[next] < l.Name {
                        next++
                }
                excluded := next < len(names) && names[next] == l.Name
                if excluded || l.Name == MetricName {
                        continue
                }
                b = append(b, l.Name...)
                b = append(b, seps[0])
                b = append(b, l.Value...)
                b = append(b, seps[0])
        }
        return xxhash.Sum64(b), b
}

// BytesWithLabels is just as Bytes(), but only for labels matching names.
// 'names' have to be sorted in ascending order.
func (ls Labels) BytesWithLabels(buf []byte, names ...string) []byte {
        out := bytes.NewBuffer(buf[:0])
        out.WriteByte(labelSep)
        // Walk ls and names in lockstep; both are expected to be sorted by name.
        for i, j := 0, 0; i < len(ls) && j < len(names); {
                l := ls[i]
                if names[j] < l.Name {
                        j++
                        continue
                }
                if l.Name < names[j] {
                        i++
                        continue
                }
                // More than the leading labelSep in the buffer means a label has
                // been written before, so a separator is needed.
                if out.Len() > 1 {
                        out.WriteByte(seps[0])
                }
                out.WriteString(l.Name)
                out.WriteByte(seps[0])
                out.WriteString(l.Value)
                i++
                j++
        }
        return out.Bytes()
}

// BytesWithoutLabels is just as Bytes(), but only for labels not matching names.
// 'names' have to be sorted in ascending order.
func (ls Labels) BytesWithoutLabels(buf []byte, names ...string) []byte {
        out := bytes.NewBuffer(buf[:0])
        out.WriteByte(labelSep)
        next := 0 // Position in names; everything before it sorts before the current label name.
        for _, l := range ls {
                for next < len(names) && names[next] < l.Name {
                        next++
                }
                if next < len(names) && names[next] == l.Name {
                        continue
                }
                // More than the leading labelSep in the buffer means a label has
                // been written before, so a separator is needed.
                if out.Len() > 1 {
                        out.WriteByte(seps[0])
                }
                out.WriteString(l.Name)
                out.WriteByte(seps[0])
                out.WriteString(l.Value)
        }
        return out.Bytes()
}

// Copy returns a copy of the labels, backed by a newly allocated slice of the
// same length.
func (ls Labels) Copy() Labels {
        return append(make(Labels, 0, len(ls)), ls...)
}

// Get returns the value for the label with the given name.
// Returns an empty string if the label doesn't exist. The first label with a
// matching name wins.
func (ls Labels) Get(name string) string {
        for i := range ls {
                if ls[i].Name != name {
                        continue
                }
                return ls[i].Value
        }
        return ""
}

// Has returns true if the label with the given name is present, regardless
// of its value.
func (ls Labels) Has(name string) bool {
        for i := range ls {
                if ls[i].Name == name {
                        return true
                }
        }
        return false
}

// HasDuplicateLabelNames returns whether ls has duplicate label names,
// together with the first duplicated name found ("" if there is none).
// It assumes that the labelset is sorted, so only neighbouring labels are
// compared.
func (ls Labels) HasDuplicateLabelNames() (string, bool) {
        for i := 1; i < len(ls); i++ {
                if prev, cur := ls[i-1].Name, ls[i].Name; prev == cur {
                        return cur, true
                }
        }
        return "", false
}

// WithoutEmpty returns the labelset without labels that have an empty value.
// May return the same labelset: if no label has an empty value, ls itself is
// returned and nothing is allocated.
func (ls Labels) WithoutEmpty() Labels {
        // Do not copy the slice until it's necessary.
        hasEmpty := false
        for i := range ls {
                if ls[i].Value == "" {
                        hasEmpty = true
                        break
                }
        }
        if !hasEmpty {
                return ls
        }

        // At least one label is dropped, so len(ls)-1 is enough capacity.
        els := make(Labels, 0, len(ls)-1)
        for _, l := range ls {
                if l.Value == "" {
                        continue
                }
                els = append(els, l)
        }
        return els
}

// Equal returns whether the two label sets are equal, i.e. they have the same
// length and the same name and value at every position.
func Equal(ls, o Labels) bool {
        return slices.Equal(ls, o)
}

// EmptyLabels returns an empty Labels value, for convenience.
// The result is a non-nil slice of length zero.
func EmptyLabels() Labels {
        return Labels{}
}

// New returns a sorted Labels from the given labels. The labels are copied
// into a new slice and sorted by name; ls itself is not reordered.
// The caller has to guarantee that all label names are unique.
func New(ls ...Label) Labels {
        sorted := append(make(Labels, 0, len(ls)), ls...)
        slices.SortFunc(sorted, func(x, y Label) int {
                return strings.Compare(x.Name, y.Name)
        })
        return sorted
}

// FromStrings creates new labels from pairs of strings, given as alternating
// names and values. The result is sorted by label name.
// It panics if the number of strings is odd.
func FromStrings(ss ...string) Labels {
        if len(ss)%2 != 0 {
                panic("invalid number of strings")
        }

        pairs := make(Labels, len(ss)/2)
        for i := range pairs {
                pairs[i] = Label{Name: ss[2*i], Value: ss[2*i+1]}
        }

        slices.SortFunc(pairs, func(x, y Label) int {
                return strings.Compare(x.Name, y.Name)
        })
        return pairs
}

// Compare compares the two label sets.
// The result will be 0 if a==b, <0 if a < b, and >0 if a > b.
// Labels are compared position by position, name first and then value; the
// first difference decides.
func Compare(a, b Labels) int {
        shared := min(len(a), len(b))
        for i := 0; i < shared; i++ {
                if c := strings.Compare(a[i].Name, b[i].Name); c != 0 {
                        return c
                }
                if c := strings.Compare(a[i].Value, b[i].Value); c != 0 {
                        return c
                }
        }
        // If all labels so far were in common, the set with fewer labels comes first.
        return len(a) - len(b)
}

// CopyFrom copies labels from b on top of whatever was in ls previously,
// reusing memory or expanding if needed.
func (ls *Labels) CopyFrom(b Labels) {
        reused := (*ls)[:0]
        *ls = append(reused, b...)
}

// IsEmpty returns true if ls represents an empty set of labels, i.e. it
// contains no label at all. A nil Labels value is empty as well.
func (ls Labels) IsEmpty() bool {
        return len(ls) == 0
}

// Range calls f on each label, in the order the labels are stored.
func (ls Labels) Range(f func(l Label)) {
        for i := range ls {
                f(ls[i])
        }
}

// Validate calls f on each label. If f returns a non-nil error, then it
// returns that error, cancelling the iteration. It returns nil if f accepted
// every label.
func (ls Labels) Validate(f func(l Label) error) error {
        for i := range ls {
                err := f(ls[i])
                if err != nil {
                        return err
                }
        }
        return nil
}

// DropMetricName returns Labels with "__name__" removed. If there is no such
// label, ls is returned as is. The original Labels are never modified.
func (ls Labels) DropMetricName() Labels {
        // Locate the first label carrying the metric name.
        pos := -1
        for i := range ls {
                if ls[i].Name == MetricName {
                        pos = i
                        break
                }
        }

        switch {
        case pos < 0:
                return ls
        case pos == 0:
                // Make common case fast with no allocations.
                return ls[1:]
        }
        // Avoid modifying original Labels - use [:pos:pos] so that left slice would not
        // have any spare capacity and append would have to allocate a new slice for the result.
        return append(ls[:pos:pos], ls[pos+1:]...)
}

// InternStrings passes every label name and value in ls through intern and
// stores the returned string back in place (name first, then value).
func (ls *Labels) InternStrings(intern func(string) string) {
        all := *ls
        for i := range all {
                name, value := all[i].Name, all[i].Value
                all[i].Name = intern(name)
                all[i].Value = intern(value)
        }
}

// ReleaseStrings passes every label name and value in ls to release
// (name first, then value, label by label).
func (ls Labels) ReleaseStrings(release func(string)) {
        for i := range ls {
                release(ls[i].Name)
                release(ls[i].Value)
        }
}

// Builder allows modifying Labels.
//
// It records changes relative to a base label set instead of editing it:
// base is the set the builder was last reset with, del holds label names to
// leave out of the result, and add holds labels to include in the result
// (taking precedence over a base label with the same name).
type Builder struct {
        base Labels
        del  []string
        add  []Label
}

// Reset discards all pending changes and makes base the new starting point.
// The del and add slices are truncated rather than reallocated. Base labels
// with an empty value are immediately marked for deletion.
func (b *Builder) Reset(base Labels) {
        b.base, b.del, b.add = base, b.del[:0], b.add[:0]
        for _, l := range b.base {
                if l.Value != "" {
                        continue
                }
                b.del = append(b.del, l.Name)
        }
}

// Labels materialises the builder's current state as a Labels value.
// When nothing was deleted or added, the base labels are returned as-is.
func (b *Builder) Labels() Labels {
        if len(b.del) == 0 && len(b.add) == 0 {
                return b.base
        }

        // Capacity hint only; clamp it so that it never drops below 1.
        sizeHint := len(b.base) + len(b.add) - len(b.del)
        if sizeHint < 1 {
                sizeHint = 1
        }
        out := make(Labels, 0, sizeHint)
        for _, l := range b.base {
                // Keep a base label only if it is neither deleted nor overridden.
                if !slices.Contains(b.del, l.Name) && !contains(b.add, l.Name) {
                        out = append(out, l)
                }
        }
        if len(b.add) == 0 {
                // Base is already in order, so no sorting is needed when nothing was added.
                return out
        }
        out = append(out, b.add...)
        slices.SortFunc(out, func(x, y Label) int { return strings.Compare(x.Name, y.Name) })
        return out
}

// ScratchBuilder allows efficient construction of a Labels from scratch.
// The add field accumulates the name/value pairs in the order they are added.
type ScratchBuilder struct {
        add Labels
}

// SymbolTable is a no-op placeholder type that exists only for API parity with dedupelabels.
type SymbolTable struct{}

// NewSymbolTable returns a nil *SymbolTable; this implementation keeps no symbols.
func NewSymbolTable() *SymbolTable {
        return nil
}

// Len always returns 0. It does not dereference t, so it is safe on a nil receiver.
func (t *SymbolTable) Len() int {
        return 0
}

// NewScratchBuilder returns an empty ScratchBuilder whose storage is
// pre-allocated with room for n labels.
func NewScratchBuilder(n int) ScratchBuilder {
        var sb ScratchBuilder
        sb.add = make([]Label, 0, n)
        return sb
}

// NewBuilderWithSymbolTable returns a Builder over an empty label set. The
// symbol table argument is ignored; it exists for API parity with dedupelabels.
func NewBuilderWithSymbolTable(_ *SymbolTable) *Builder {
        base := EmptyLabels()
        return NewBuilder(base)
}

// NewScratchBuilderWithSymbolTable returns the same ScratchBuilder as
// NewScratchBuilder(n). The symbol table argument is ignored; it exists for
// API parity with dedupelabels.
func NewScratchBuilderWithSymbolTable(_ *SymbolTable, n int) ScratchBuilder {
        sb := NewScratchBuilder(n)
        return sb
}

// SetSymbolTable does nothing in this implementation; the argument is ignored.
func (b *ScratchBuilder) SetSymbolTable(_ *SymbolTable) {
        // no-op
}

// Reset drops all pairs added so far while keeping the underlying storage for reuse.
func (b *ScratchBuilder) Reset() {
        kept := b.add[:0]
        b.add = kept
}

// Add appends a name/value pair.
// No de-duplication is done: adding the same name twice yields a duplicate
// label, which is invalid.
func (b *ScratchBuilder) Add(name, value string) {
        pair := Label{Name: name, Value: value}
        b.add = append(b.add, pair)
}

// UnsafeAddBytes appends a name/value pair given as []byte instead of string.
// The '-tags stringlabels' version of this function is unsafe, hence the name.
// This version is safe: both byte slices are converted (copied) to strings
// immediately. The name is kept so that everything compiles.
func (b *ScratchBuilder) UnsafeAddBytes(name, value []byte) {
        pair := Label{
                Name:  string(name),
                Value: string(value),
        }
        b.add = append(b.add, pair)
}

// Sort orders the labels added so far by name, in place.
func (b *ScratchBuilder) Sort() {
        byName := func(x, y Label) int {
                return strings.Compare(x.Name, y.Name)
        }
        slices.SortFunc(b.add, byName)
}

// Assign makes the builder hold exactly the labels in ls, for when you already
// have a Labels which you want this ScratchBuilder to return.
func (b *ScratchBuilder) Assign(ls Labels) {
        // Copy on top of our own slice, so the input slice is not retained.
        dst := b.add[:0]
        b.add = append(dst, ls...)
}

// Labels returns the name/value pairs added so far as a new Labels value.
// Note: the pairs are returned in insertion order; call Sort() first if you
// want them sorted.
func (b *ScratchBuilder) Labels() Labels {
        // Hand out a copy, so the next use of the ScratchBuilder doesn't overwrite it.
        out := make([]Label, len(b.add))
        copy(out, b.add)
        return out
}

// Overwrite writes the newly-built Labels out to ls, truncating ls and
// appending onto its existing storage.
// Callers must ensure that there are no other references to ls, or any strings fetched from it.
func (b *ScratchBuilder) Overwrite(ls *Labels) {
        dst := (*ls)[:0]
        *ls = append(dst, b.add...)
}

// Copyright 2017 The Prometheus Authors
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package labels

import (
        "bytes"
        "encoding/json"
        "slices"
        "strconv"
        "unsafe"

        "github.com/prometheus/common/model"
)

// Well-known label names, plus a separator byte.
const (
        MetricName   = "__name__"
        AlertName    = "alertname"
        BucketLabel  = "le"
        InstanceName = "instance"

        // labelSep is the byte 0xfe.
        // NOTE(review): its use is not visible in this part of the file — presumably a separator between encoded labels; confirm.
        labelSep = '\xfe'
)

// seps is a one-element byte slice holding 0xff.
// NOTE(review): its use is not visible in this part of the file — presumably written between strings when hashing or encoding labels; confirm.
var seps = []byte{'\xff'}

// Label is a key/value pair of strings: Name is the label name and Value its value.
type Label struct {
        Name, Value string
}

// String renders the labels as `{name="value", name2="value2"}`. Names are
// written verbatim; values are quoted with strconv.AppendQuote.
func (ls Labels) String() string {
        // The buffer starts out backed by a fixed-size local array, intended to
        // avoid a heap allocation while building typical outputs.
        var scratch [1024]byte
        buf := bytes.NewBuffer(scratch[:0])

        buf.WriteByte('{')
        first := true
        ls.Range(func(l Label) {
                if !first {
                        buf.WriteString(", ")
                }
                first = false
                buf.WriteString(l.Name)
                buf.WriteByte('=')
                buf.Write(strconv.AppendQuote(buf.AvailableBuffer(), l.Value))
        })
        buf.WriteByte('}')
        return buf.String()
}

// MarshalJSON implements json.Marshaler by encoding the labels as a JSON
// object mapping label names to values.
func (ls Labels) MarshalJSON() ([]byte, error) {
        asMap := ls.Map()
        return json.Marshal(asMap)
}

// UnmarshalJSON implements json.Unmarshaler. The input is decoded as a
// name-to-value map and converted with FromMap; on a decoding error ls is
// left unchanged.
func (ls *Labels) UnmarshalJSON(b []byte) error {
        var decoded map[string]string
        err := json.Unmarshal(b, &decoded)
        if err != nil {
                return err
        }

        *ls = FromMap(decoded)
        return nil
}

// MarshalYAML implements yaml.Marshaler by exposing the labels as a
// name-to-value map. The returned error is always nil.
func (ls Labels) MarshalYAML() (interface{}, error) {
        asMap := ls.Map()
        return asMap, nil
}

// UnmarshalYAML implements yaml.Unmarshaler. The input is decoded as a
// name-to-value map and converted with FromMap; on a decoding error ls is
// left unchanged.
func (ls *Labels) UnmarshalYAML(unmarshal func(interface{}) error) error {
        var decoded map[string]string
        err := unmarshal(&decoded)
        if err != nil {
                return err
        }

        *ls = FromMap(decoded)
        return nil
}

// IsValid reports whether every label passes validation: the value of the
// metric name label must be a valid metric name, and every label name and
// label value must be valid according to the model package.
func (ls Labels) IsValid() bool {
        check := func(l Label) error {
                switch {
                case l.Name == model.MetricNameLabel && !model.IsValidMetricName(model.LabelValue(l.Value)):
                        return strconv.ErrSyntax
                case !model.LabelName(l.Name).IsValid(), !model.LabelValue(l.Value).IsValid():
                        return strconv.ErrSyntax
                }
                return nil
        }
        // The error value is only used as a "failed" signal.
        return ls.Validate(check) == nil
}

// Map returns a newly allocated map from label name to label value.
func (ls Labels) Map() map[string]string {
        out := map[string]string{}
        ls.Range(func(l Label) { out[l.Name] = l.Value })
        return out
}

// FromMap builds Labels from the entries of m by collecting them into a slice
// and handing that slice to New.
func FromMap(m map[string]string) Labels {
        pairs := make([]Label, 0, len(m))
        for name, value := range m {
                pair := Label{Name: name, Value: value}
                pairs = append(pairs, pair)
        }
        return New(pairs...)
}

// NewBuilder returns a new Builder that starts from base. The del and add
// slices are pre-allocated with a small capacity before the builder is reset
// onto base.
func NewBuilder(base Labels) *Builder {
        b := new(Builder)
        b.del = make([]string, 0, 5)
        b.add = make([]Label, 0, 5)
        b.Reset(base)
        return b
}

// Del marks the labels with the given names for deletion.
//
// For each name, any pending addition with that name is dropped from the add
// list and the name is appended to the del list (whether or not the base
// contains such a label). It returns b to allow call chaining.
func (b *Builder) Del(ns ...string) *Builder {
        for _, n := range ns {
                // slices.DeleteFunc removes every matching entry in one pass. The
                // previous code resliced b.add while ranging over it, which skipped
                // the element that shifted into the removed slot.
                b.add = slices.DeleteFunc(b.add, func(a Label) bool { return a.Name == n })
                b.del = append(b.del, n)
        }
        return b
}

// Keep marks every base label whose name is not listed in ns for deletion.
// Only the base labels are examined. It returns b to allow call chaining.
func (b *Builder) Keep(ns ...string) *Builder {
        b.base.Range(func(l Label) {
                wanted := false
                for i := range ns {
                        if ns[i] == l.Name {
                                wanted = true
                                break
                        }
                }
                if !wanted {
                        b.del = append(b.del, l.Name)
                }
        })
        return b
}

// Set records the name/value pair as a label. A value of "" means delete that
// label. If the name already has a pending addition its value is replaced,
// otherwise a new addition is appended. It returns b to allow call chaining.
func (b *Builder) Set(n, v string) *Builder {
        if v == "" {
                // Empty labels are the same as missing labels.
                return b.Del(n)
        }
        for i := range b.add {
                if b.add[i].Name != n {
                        continue
                }
                b.add[i].Value = v
                return b
        }

        b.add = append(b.add, Label{Name: n, Value: v})
        return b
}

// Get returns the value the label named n has in the builder's current state:
// the pending added value if there is one, "" if the label is marked deleted,
// and otherwise whatever the base labels report.
func (b *Builder) Get(n string) string {
        // Del() removes entries from .add but Set() does not remove from .del, so .add is consulted first.
        for i := range b.add {
                if b.add[i].Name == n {
                        return b.add[i].Value
                }
        }
        if !slices.Contains(b.del, n) {
                return b.base.Get(n)
        }
        return ""
}

// Range calls f on each label in the Builder: first the base labels that are
// neither deleted nor overridden, then the added labels in insertion order.
func (b *Builder) Range(f func(l Label)) {
        // Fixed-size local arrays back the snapshots, intended to avoid heap
        // allocation in most cases.
        var (
                addBuf [128]Label
                delBuf [128]string
        )
        // Snapshot add and del, so the iteration is unaffected by calls to Set() or Del() made from f.
        added := append(addBuf[:0], b.add...)
        deleted := append(delBuf[:0], b.del...)

        b.base.Range(func(l Label) {
                if slices.Contains(deleted, l.Name) || contains(added, l.Name) {
                        return
                }
                f(l)
        })
        for i := range added {
                f(added[i])
        }
}

// contains reports whether s holds a label named n.
func contains(s []Label, n string) bool {
        for i := range s {
                if s[i].Name == n {
                        return true
                }
        }
        return false
}

// yoloString returns a string that shares its memory with b instead of
// copying it. Because nothing is copied, the caller must not modify b for as
// long as the returned string is in use.
//
// It uses unsafe.String/unsafe.SliceData, the documented way to perform this
// conversion, rather than reinterpreting the slice header as a string header.
func yoloString(b []byte) string {
        return unsafe.String(unsafe.SliceData(b), len(b))
}

// Copyright 2017 The Prometheus Authors
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package labels

import (
        "bytes"
        "strconv"
)

// MatchType is an enum for label matching types.
type MatchType int

// Possible MatchTypes. The values are consecutive, starting at 0 for MatchEqual.
const (
        MatchEqual MatchType = iota
        MatchNotEqual
        MatchRegexp
        MatchNotRegexp
)

// matchTypeToStr maps each MatchType to its operator text, indexed by the MatchType value.
var matchTypeToStr = [...]string{
        MatchEqual:     "=",
        MatchNotEqual:  "!=",
        MatchRegexp:    "=~",
        MatchNotRegexp: "!~",
}

// String returns the operator text for m ("=", "!=", "=~" or "!~").
// It panics if m is not one of the declared MatchTypes.
func (m MatchType) String() string {
        switch m {
        case MatchEqual, MatchNotEqual, MatchRegexp, MatchNotRegexp:
                return matchTypeToStr[m]
        }
        panic("unknown match type")
}

// Matcher models the matching of a label.
//
// Type selects the comparison, Name is the label name and Value is the value
// or regular expression to compare against. re is only populated by NewMatcher
// for the regexp match types and is nil otherwise.
type Matcher struct {
        Type  MatchType
        Name  string
        Value string

        re *FastRegexMatcher
}

// NewMatcher returns a matcher of type t for label name n and value v.
// For the regexp match types, v is compiled with NewFastRegexMatcher and any
// compilation error is returned along with a nil matcher.
func NewMatcher(t MatchType, n, v string) (*Matcher, error) {
        m := &Matcher{Type: t, Name: n, Value: v}
        switch t {
        case MatchRegexp, MatchNotRegexp:
                re, err := NewFastRegexMatcher(v)
                if err != nil {
                        return nil, err
                }
                m.re = re
        }
        return m, nil
}

// MustNewMatcher is like NewMatcher but panics with the error instead of
// returning it - only for use in tests!
func MustNewMatcher(mt MatchType, name, val string) *Matcher {
        m, err := NewMatcher(mt, name, val)
        if err == nil {
                return m
        }
        panic(err)
}

// String renders the matcher as name, operator and quoted value, for example
// `job="api"`. The name is quoted as well when shouldQuoteName says so.
func (m *Matcher) String() string {
        // The buffer starts out backed by a fixed-size local array, sized to cover most needs.
        var scratch [1024]byte
        buf := bytes.NewBuffer(scratch[:0])

        if !m.shouldQuoteName() {
                buf.WriteString(m.Name)
        } else {
                buf.Write(strconv.AppendQuote(buf.AvailableBuffer(), m.Name))
        }
        buf.WriteString(m.Type.String())
        buf.Write(strconv.AppendQuote(buf.AvailableBuffer(), m.Value))

        return buf.String()
}

// shouldQuoteName reports whether the label name must be quoted when printed.
// That is the case when the name is empty or contains any character other
// than '_', ASCII letters, or ASCII digits after the first position.
func (m *Matcher) shouldQuoteName() bool {
        if m.Name == "" {
                return true
        }
        for i, c := range m.Name {
                switch {
                case c == '_', 'a' <= c && c <= 'z', 'A' <= c && c <= 'Z':
                        // Always allowed.
                case i > 0 && '0' <= c && c <= '9':
                        // Digits are allowed anywhere but at the start.
                default:
                        return true
                }
        }
        return false
}

// Matches reports whether the matcher matches the given string value.
// It panics if the matcher has an unknown match type.
func (m *Matcher) Matches(s string) bool {
        if m.Type == MatchEqual {
                return s == m.Value
        }
        if m.Type == MatchNotEqual {
                return s != m.Value
        }
        if m.Type == MatchRegexp {
                return m.re.MatchString(s)
        }
        if m.Type == MatchNotRegexp {
                return !m.re.MatchString(s)
        }
        panic("labels.Matcher.Matches: invalid match type")
}

// Inverse returns a new matcher with the same name and value that matches the
// opposite: equal becomes not-equal, regexp becomes not-regexp, and vice versa.
// The error comes from NewMatcher. It panics if the matcher has an unknown
// match type.
func (m *Matcher) Inverse() (*Matcher, error) {
        switch m.Type {
        case MatchEqual:
                return NewMatcher(MatchNotEqual, m.Name, m.Value)
        case MatchNotEqual:
                return NewMatcher(MatchEqual, m.Name, m.Value)
        case MatchRegexp:
                return NewMatcher(MatchNotRegexp, m.Name, m.Value)
        case MatchNotRegexp:
                return NewMatcher(MatchRegexp, m.Name, m.Value)
        }
        // Name this method in the message so the panic points at the right place.
        panic("labels.Matcher.Inverse: invalid match type")
}

// GetRegexString returns the regex string of the underlying regex matcher, or
// "" when the matcher has none.
func (m *Matcher) GetRegexString() string {
        if m.re != nil {
                return m.re.GetRegexString()
        }
        return ""
}

// SetMatches returns a set of equality matchers for the current regex matcher if possible.
// For example, the regexp `a(b|f)` will return "ab" and "af".
// It returns nil if the matcher has no regex matcher or if the regexp can't be
// replaced by only equality matchers.
func (m *Matcher) SetMatches() []string {
        if m.re != nil {
                return m.re.SetMatches()
        }
        return nil
}

// Prefix returns the required prefix of the value to match, if possible.
// It is empty when the matcher has no regex matcher (for example an equality
// matcher) or when no prefix was determined for the regex.
func (m *Matcher) Prefix() string {
        if m.re != nil {
                return m.re.prefix
        }
        return ""
}

// IsRegexOptimized reports whether the underlying regex matcher is optimized.
// It is false when the matcher has no regex matcher.
func (m *Matcher) IsRegexOptimized() bool {
        if m.re != nil {
                return m.re.IsOptimized()
        }
        return false
}

// Copyright 2020 The Prometheus Authors
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package labels

import (
        "slices"
        "strings"
        "unicode"
        "unicode/utf8"

        "github.com/grafana/regexp"
        "github.com/grafana/regexp/syntax"
        "golang.org/x/text/unicode/norm"
)

const (
        // maxSetMatches caps the number of set matches: tooManyMatches rejects
        // larger sets and character classes covering more characters than this
        // are not expanded.
        maxSetMatches = 256

        // The minimum number of alternate values a regex should have to trigger
        // the optimization done by optimizeEqualStringMatchers() and so use a map
        // to match values instead of iterating over a list. This value has
        // been computed running BenchmarkOptimizeEqualStringMatchers.
        minEqualMultiStringMatcherMapThreshold = 16
)

// FastRegexMatcher matches strings against a regular expression, using
// cheaper checks (set matches, a string matcher, literal prefix/suffix/contains)
// when NewFastRegexMatcher was able to derive them.
type FastRegexMatcher struct {
        // Under some conditions, re is nil because the expression is never parsed.
        // We store the original string to be able to return it in GetRegexString().
        reString string
        re       *regexp.Regexp

        // Optional optimizations derived from the expression; each one is left at
        // its zero value when it could not be determined.
        setMatches    []string
        stringMatcher StringMatcher
        prefix        string
        suffix        string
        contains      []string

        // matchString is the "compiled" function to run by MatchString().
        matchString func(string) bool
}

// NewFastRegexMatcher builds a FastRegexMatcher for the expression v.
//
// It first tries to treat v as a plain alternation of literals; if that
// works the expression is never parsed. Otherwise v is parsed with Perl
// syntax, simplified, compiled anchored at both ends, and inspected for
// further optimizations. A parse or compile error is returned with a nil
// matcher.
func NewFastRegexMatcher(v string) (*FastRegexMatcher, error) {
        m := &FastRegexMatcher{
                reString: v,
        }

        m.stringMatcher, m.setMatches = optimizeAlternatingLiterals(v)
        if m.stringMatcher != nil {
                // If we already have a string matcher, we don't need to parse the regex
                // or compile the matchString function. This also avoids the behavior in
                // compileMatchStringFunction where it prefers to use setMatches when
                // available, even if the string matcher is faster.
                m.matchString = m.stringMatcher.Matches
        } else {
                parsed, err := syntax.Parse(v, syntax.Perl)
                if err != nil {
                        return nil, err
                }
                // Simplify the syntax tree to run faster.
                parsed = parsed.Simplify()
                // Compile before the steps below: they modify the parsed tree in place.
                m.re, err = regexp.Compile("^(?:" + parsed.String() + ")$")
                if err != nil {
                        return nil, err
                }
                if parsed.Op == syntax.OpConcat {
                        m.prefix, m.suffix, m.contains = optimizeConcatRegex(parsed)
                }
                // Set matches are only kept when they are case sensitive.
                if matches, caseSensitive := findSetMatches(parsed); caseSensitive {
                        m.setMatches = matches
                }
                m.stringMatcher = stringMatcherFromRegexp(parsed)
                m.matchString = m.compileMatchStringFunction()
        }

        return m, nil
}

// compileMatchStringFunction returns the function to run by MatchString().
// The returned function applies the available optimizations in a fixed order:
// set matches, prefix, suffix, contains, string matcher, and finally the
// compiled regexp.
func (m *FastRegexMatcher) compileMatchStringFunction() func(string) bool {
        // If the only optimization available is the string matcher, then we can just run it.
        onlyStringMatcher := len(m.setMatches) == 0 && m.prefix == "" && m.suffix == "" && len(m.contains) == 0 && m.stringMatcher != nil
        if onlyStringMatcher {
                return m.stringMatcher.Matches
        }

        return func(s string) bool {
                if len(m.setMatches) != 0 {
                        // With set matches, the answer is decided by membership alone.
                        for i := range m.setMatches {
                                if m.setMatches[i] == s {
                                        return true
                                }
                        }
                        return false
                }
                switch {
                case m.prefix != "" && !strings.HasPrefix(s, m.prefix):
                        return false
                case m.suffix != "" && !strings.HasSuffix(s, m.suffix):
                        return false
                case len(m.contains) > 0 && !containsInOrder(s, m.contains):
                        return false
                case m.stringMatcher != nil:
                        return m.stringMatcher.Matches(s)
                }
                return m.re.MatchString(s)
        }
}

// IsOptimized reports whether any fast-path optimization (set matches, string
// matcher, prefix, suffix or contains) is applied to the regex matcher.
func (m *FastRegexMatcher) IsOptimized() bool {
        switch {
        case len(m.setMatches) > 0,
                m.stringMatcher != nil,
                m.prefix != "",
                m.suffix != "",
                len(m.contains) > 0:
                return true
        }
        return false
}

// findSetMatches extracts equality matches from a regexp, after removing
// begin/end text operators from re in place.
// It returns nil if the regexp can't be replaced by only equality matchers or
// if it contains a mix of case sensitive and case insensitive matchers.
func findSetMatches(re *syntax.Regexp) (matches []string, caseSensitive bool) {
        clearBeginEndText(re)

        matches, caseSensitive = findSetMatchesInternal(re, "")
        return matches, caseSensitive
}

// findSetMatchesInternal returns the strings matched by re, each prefixed
// with base, together with whether the match is case sensitive. It returns
// nil matches when re can't be expressed as a set of strings.
func findSetMatchesInternal(re *syntax.Regexp, base string) (matches []string, caseSensitive bool) {
        switch re.Op {
        case syntax.OpBeginText:
                // Correctly handling the begin text operator inside a regex is tricky,
                // so in this case we fallback to the regex engine.
                return nil, false
        case syntax.OpEndText:
                // Correctly handling the end text operator inside a regex is tricky,
                // so in this case we fallback to the regex engine.
                return nil, false
        case syntax.OpLiteral:
                return []string{base + string(re.Rune)}, isCaseSensitive(re)
        case syntax.OpEmptyMatch:
                // With an empty base this falls through to the final return below.
                if base != "" {
                        return []string{base}, isCaseSensitive(re)
                }
        case syntax.OpAlternate:
                return findSetMatchesFromAlternate(re, base)
        case syntax.OpCapture:
                // Replace the capture by its content, then process the result.
                clearCapture(re)
                return findSetMatchesInternal(re, base)
        case syntax.OpConcat:
                return findSetMatchesFromConcat(re, base)
        case syntax.OpCharClass:
                // re.Rune holds pairs of range bounds (lo, hi), so its length must be even.
                if len(re.Rune)%2 != 0 {
                        return nil, false
                }
                var matches []string
                var totalSet int
                for i := 0; i+1 < len(re.Rune); i += 2 {
                        totalSet += int(re.Rune[i+1]-re.Rune[i]) + 1
                }
                // limits the total characters that can be used to create matches.
                // In some case like negation [^0-9] a lot of possibilities exists and that
                // can create thousands of possible matches at which points we're better off using regexp.
                if totalSet > maxSetMatches {
                        return nil, false
                }
                // Expand every range into one match per character.
                for i := 0; i+1 < len(re.Rune); i += 2 {
                        lo, hi := re.Rune[i], re.Rune[i+1]
                        for c := lo; c <= hi; c++ {
                                matches = append(matches, base+string(c))
                        }
                }
                return matches, isCaseSensitive(re)
        default:
                return nil, false
        }
        return nil, false
}

// findSetMatchesFromConcat expands a concatenation into the set of strings it
// matches, each prefixed with base. Captures among the sub-expressions are
// removed in place first. It returns nil when any sub-expression can't be
// expanded, when the set grows past maxSetMatches, or when the sub-expressions
// disagree on case sensitivity.
func findSetMatchesFromConcat(re *syntax.Regexp, base string) (matches []string, matchesCaseSensitive bool) {
        if len(re.Sub) == 0 {
                return nil, false
        }
        clearCapture(re.Sub...)

        matches = []string{base}

        for i, sub := range re.Sub {
                // Extend every match found so far with everything sub can match.
                var extended []string
                for j, prefix := range matches {
                        found, caseSensitive := findSetMatchesInternal(sub, prefix)
                        if found == nil || tooManyMatches(extended, found...) {
                                return nil, false
                        }

                        // All matches must have the same case sensitivity. The first set of
                        // matches returned defines the expected case, and all other ones are
                        // checked against it.
                        if i == 0 && j == 0 {
                                matchesCaseSensitive = caseSensitive
                        } else if matchesCaseSensitive != caseSensitive {
                                return nil, false
                        }

                        extended = append(extended, found...)
                }
                matches = extended
        }

        return matches, matchesCaseSensitive
}

// findSetMatchesFromAlternate collects the set matches of every branch of an
// alternation, each prefixed with base. It returns nil when any branch can't
// be expanded, when the set grows past maxSetMatches, or when the branches
// disagree on case sensitivity.
func findSetMatchesFromAlternate(re *syntax.Regexp, base string) (matches []string, matchesCaseSensitive bool) {
        for i, branch := range re.Sub {
                found, caseSensitive := findSetMatchesInternal(branch, base)
                if found == nil || tooManyMatches(matches, found...) {
                        return nil, false
                }

                // All matches must have the same case sensitivity. The first branch
                // defines the expected case, and all other ones are checked against it.
                if i == 0 {
                        matchesCaseSensitive = caseSensitive
                } else if matchesCaseSensitive != caseSensitive {
                        return nil, false
                }

                matches = append(matches, found...)
        }

        return matches, matchesCaseSensitive
}

// clearCapture replaces, in place, every given regexp that is a capture group
// by the expression it captures, as capture operations are not used for matching.
func clearCapture(regs ...*syntax.Regexp) {
        for i := range regs {
                node := regs[i]
                // Keep unwrapping, because capture groups could be nested.
                for node.Op == syntax.OpCapture {
                        inner := node.Sub[0]
                        *node = *inner
                }
        }
}

// clearBeginEndText removes, in place, a begin text operator at the start and
// an end text operator at the end of the regexp's sub-expressions. Prometheus
// regexps are anchored to the beginning and end of the string.
func clearBeginEndText(re *syntax.Regexp) {
        // Do not clear begin/end text from an alternate operator because it could
        // change the actual regexp properties. Without sub-expressions there is
        // nothing to clear either.
        if re.Op == syntax.OpAlternate || len(re.Sub) == 0 {
                return
        }

        if len(re.Sub) == 1 {
                switch re.Sub[0].Op {
                case syntax.OpBeginText, syntax.OpEndText:
                        // The only element has to go, so the regexp becomes a matcher of the
                        // empty string. OpEmptyMatch is regexp's nop operator.
                        re.Op = syntax.OpEmptyMatch
                        re.Sub = nil
                        return
                }
        }
        if first := re.Sub[0]; first.Op == syntax.OpBeginText {
                re.Sub = re.Sub[1:]
        }
        if last := len(re.Sub) - 1; re.Sub[last].Op == syntax.OpEndText {
                re.Sub = re.Sub[:last]
        }
}

// isCaseInsensitive reports whether the FoldCase flag is set on reg.
// The flag should be checked at each level of the syntax tree.
func isCaseInsensitive(reg *syntax.Regexp) bool {
        folded := reg.Flags & syntax.FoldCase
        return folded != 0
}

// isCaseSensitive reports whether reg is case sensitive, i.e. not case insensitive.
// The flag should be checked at each level of the syntax tree.
func isCaseSensitive(reg *syntax.Regexp) bool {
        if isCaseInsensitive(reg) {
                return false
        }
        return true
}

// tooManyMatches guards against creating too many set matches: it reports
// whether adding added to matches would exceed maxSetMatches entries.
func tooManyMatches(matches []string, added ...string) bool {
        total := len(matches) + len(added)
        return total > maxSetMatches
}

// MatchString reports whether s matches, by running the match function
// selected when the matcher was built.
func (m *FastRegexMatcher) MatchString(s string) bool {
        match := m.matchString
        return match(s)
}

// SetMatches returns a copy of the set matches of the matcher.
func (m *FastRegexMatcher) SetMatches() []string {
        // IMPORTANT: always hand out a copy, otherwise a caller manipulating the
        // slice would also manipulate the cached FastRegexMatcher instance.
        out := slices.Clone(m.setMatches)
        return out
}

// GetRegexString returns the original expression the matcher was built from.
func (m *FastRegexMatcher) GetRegexString() string {
        original := m.reString
        return original
}

// optimizeAlternatingLiterals optimizes a regex of the form
//
//        `literal1|literal2|literal3|...`
//
// It returns an optimized StringMatcher together with the matching set
// matches, or nil for both if the regex cannot be optimized in this way.
// A string counts as a literal when regexp.QuoteMeta leaves it unchanged.
func optimizeAlternatingLiterals(s string) (StringMatcher, []string) {
        if s == "" {
                return emptyStringMatcher{}, nil
        }

        isLiteral := func(v string) bool { return regexp.QuoteMeta(v) == v }
        estimatedAlternates := 1 + strings.Count(s, "|")

        // Without alternates, the whole string must be a single literal.
        if estimatedAlternates == 1 {
                if !isLiteral(s) {
                        return nil, nil
                }
                return &equalStringMatcher{s: s, caseSensitive: true}, []string{s}
        }

        multiMatcher := newEqualMultiStringMatcher(true, estimatedAlternates)

        rest := s
        for {
                // Split off the next alternate from the remainder.
                alternate, tail, found := strings.Cut(rest, "|")
                if !found {
                        break
                }
                // Give up as soon as one of the alternates is not a literal.
                if !isLiteral(alternate) {
                        return nil, nil
                }
                multiMatcher.add(alternate)
                rest = tail
        }

        // The remainder is the last alternate; it must be a literal too.
        if !isLiteral(rest) {
                return nil, nil
        }
        multiMatcher.add(rest)

        return multiMatcher, multiMatcher.setMatches()
}

// optimizeConcatRegex returns case-sensitive literal text taken from the
// sub-expressions of the concatenation r: the first one as prefix, the last
// one as suffix and the ones in between as contains. Results stay empty where
// the corresponding sub-expression is not such a literal. Captures among the
// sub-expressions are removed in place.
func optimizeConcatRegex(r *syntax.Regexp) (prefix, suffix string, contains []string) {
        sub := r.Sub
        clearCapture(sub...)

        // We can safely remove begin and end text matchers respectively
        // at the beginning and end of the regexp.
        if len(sub) > 0 && sub[0].Op == syntax.OpBeginText {
                sub = sub[1:]
        }
        if n := len(sub); n > 0 && sub[n-1].Op == syntax.OpEndText {
                sub = sub[:n-1]
        }

        if len(sub) == 0 {
                return
        }

        // literalOf returns the text of a case-sensitive literal, and false for anything else.
        literalOf := func(re *syntax.Regexp) (string, bool) {
                if re.Op != syntax.OpLiteral || re.Flags&syntax.FoldCase != 0 {
                        return "", false
                }
                return string(re.Rune), true
        }

        // Given Prometheus regex matchers are always anchored to the begin/end
        // of the text, if the first/last operations are literals, we can safely
        // treat them as prefix/suffix.
        if text, ok := literalOf(sub[0]); ok {
                prefix = text
        }
        if text, ok := literalOf(sub[len(sub)-1]); ok {
                suffix = text
        }

        // Keep track of every case-sensitive literal which is neither the first
        // nor the last sub-expression.
        for i := 1; i+1 < len(sub); i++ {
                if text, ok := literalOf(sub[i]); ok {
                        contains = append(contains, text)
                }
        }

        return
}

// StringMatcher is implemented by matchers that decide whether a string
// matches using plain string operations in place of a regular expression.
type StringMatcher interface {
        // Matches reports whether s is accepted by the matcher.
        Matches(s string) bool
}

// stringMatcherFromRegexp tries to build a StringMatcher which can be used in
// place of the given parsed regexp. The result of the conversion is passed
// through optimizeEqualStringMatchers before being returned.
// It returns nil if the regexp is not supported.
func stringMatcherFromRegexp(re *syntax.Regexp) StringMatcher {
        clearBeginEndText(re)

        matcher := stringMatcherFromRegexpInternal(re)
        return optimizeEqualStringMatchers(matcher, minEqualMultiStringMatcherMapThreshold)
}

// stringMatcherFromRegexpInternal recursively converts the parsed regexp re
// into a StringMatcher. It returns nil whenever re (or one of the
// sub-expressions it depends on) has no string-based equivalent here, in which
// case the caller is expected to fall back to the regex engine.
//
// NOTE: when re is a concatenation, leading/trailing ".+", ".*" and ".?"
// sub-expressions are trimmed from re.Sub in place.
func stringMatcherFromRegexpInternal(re *syntax.Regexp) StringMatcher {
        clearCapture(re)

        switch re.Op {
        case syntax.OpBeginText:
                // Correctly handling the begin text operator inside a regex is tricky,
                // so in this case we fallback to the regex engine.
                return nil
        case syntax.OpEndText:
                // Correctly handling the end text operator inside a regex is tricky,
                // so in this case we fallback to the regex engine.
                return nil
        case syntax.OpPlus:
                // Only optimize for ".+".
                if re.Sub[0].Op != syntax.OpAnyChar && re.Sub[0].Op != syntax.OpAnyCharNotNL {
                        return nil
                }
                return &anyNonEmptyStringMatcher{
                        matchNL: re.Sub[0].Op == syntax.OpAnyChar,
                }
        case syntax.OpStar:
                // Only optimize for ".*".
                if re.Sub[0].Op != syntax.OpAnyChar && re.Sub[0].Op != syntax.OpAnyCharNotNL {
                        return nil
                }

                // If the newline is valid, then this matcher literally matches any string (even empty).
                if re.Sub[0].Op == syntax.OpAnyChar {
                        return trueMatcher{}
                }

                // Any string is fine (including an empty one), as long as it doesn't contain any newline.
                return anyStringWithoutNewlineMatcher{}
        case syntax.OpQuest:
                // Only optimize for ".?".
                if len(re.Sub) != 1 || (re.Sub[0].Op != syntax.OpAnyChar && re.Sub[0].Op != syntax.OpAnyCharNotNL) {
                        return nil
                }

                return &zeroOrOneCharacterStringMatcher{
                        matchNL: re.Sub[0].Op == syntax.OpAnyChar,
                }
        case syntax.OpEmptyMatch:
                return emptyStringMatcher{}

        case syntax.OpLiteral:
                return &equalStringMatcher{
                        s:             string(re.Rune),
                        caseSensitive: !isCaseInsensitive(re),
                }
        case syntax.OpAlternate:
                // Every alternative must be convertible, otherwise the whole
                // alternation is not supported.
                or := make([]StringMatcher, 0, len(re.Sub))
                for _, sub := range re.Sub {
                        m := stringMatcherFromRegexpInternal(sub)
                        if m == nil {
                                return nil
                        }
                        or = append(or, m)
                }
                return orStringMatcher(or)
        case syntax.OpConcat:
                clearCapture(re.Sub...)

                if len(re.Sub) == 0 {
                        return emptyStringMatcher{}
                }
                if len(re.Sub) == 1 {
                        return stringMatcherFromRegexpInternal(re.Sub[0])
                }

                var left, right StringMatcher

                // Let's try to find if there's a first and last any matchers.
                // When found, they're converted and removed from re.Sub.
                if re.Sub[0].Op == syntax.OpPlus || re.Sub[0].Op == syntax.OpStar || re.Sub[0].Op == syntax.OpQuest {
                        left = stringMatcherFromRegexpInternal(re.Sub[0])
                        if left == nil {
                                return nil
                        }
                        re.Sub = re.Sub[1:]
                }
                if re.Sub[len(re.Sub)-1].Op == syntax.OpPlus || re.Sub[len(re.Sub)-1].Op == syntax.OpStar || re.Sub[len(re.Sub)-1].Op == syntax.OpQuest {
                        right = stringMatcherFromRegexpInternal(re.Sub[len(re.Sub)-1])
                        if right == nil {
                                return nil
                        }
                        re.Sub = re.Sub[:len(re.Sub)-1]
                }

                // Look for a fixed set of literals in what is left of the concatenation.
                matches, matchesCaseSensitive := findSetMatchesInternal(re, "")

                if len(matches) == 0 && len(re.Sub) == 2 {
                        // We have not found fixed set matches. We look for other known cases that
                        // we can optimize.
                        switch {
                        // Prefix is literal.
                        case right == nil && re.Sub[0].Op == syntax.OpLiteral:
                                right = stringMatcherFromRegexpInternal(re.Sub[1])
                                if right != nil {
                                        matches = []string{string(re.Sub[0].Rune)}
                                        matchesCaseSensitive = !isCaseInsensitive(re.Sub[0])
                                }

                        // Suffix is literal.
                        case left == nil && re.Sub[1].Op == syntax.OpLiteral:
                                left = stringMatcherFromRegexpInternal(re.Sub[0])
                                if left != nil {
                                        matches = []string{string(re.Sub[1].Rune)}
                                        matchesCaseSensitive = !isCaseInsensitive(re.Sub[1])
                                }
                        }
                }

                // Ensure we've found some literals to match (optionally with a left and/or right matcher).
                // If not, then this optimization doesn't trigger.
                if len(matches) == 0 {
                        return nil
                }

                // Use the right (and best) matcher based on what we've found.
                switch {
                // No left and right matchers (only fixed set matches).
                case left == nil && right == nil:
                        // If there are no any matchers on both sides, it's a concat of literals.
                        or := make([]StringMatcher, 0, len(matches))
                        for _, match := range matches {
                                or = append(or, &equalStringMatcher{
                                        s:             match,
                                        caseSensitive: matchesCaseSensitive,
                                })
                        }
                        return orStringMatcher(or)

                // Right matcher with 1 fixed set match.
                // The previous case guarantees right is not nil here.
                case left == nil && len(matches) == 1:
                        return &literalPrefixStringMatcher{
                                prefix:              matches[0],
                                prefixCaseSensitive: matchesCaseSensitive,
                                right:               right,
                        }

                // Left matcher with 1 fixed set match.
                // The previous cases guarantee left is not nil here.
                case right == nil && len(matches) == 1:
                        return &literalSuffixStringMatcher{
                                left:                left,
                                suffix:              matches[0],
                                suffixCaseSensitive: matchesCaseSensitive,
                        }

                // We found literals in the middle. We can trigger the fast path only if
                // the matches are case sensitive because containsStringMatcher doesn't
                // support case insensitive.
                case matchesCaseSensitive:
                        return &containsStringMatcher{
                                substrings: matches,
                                left:       left,
                                right:      right,
                        }
                }
        }
        // Unsupported operator, or a concatenation we can't optimize.
        return nil
}

// containsStringMatcher matches a string against a list of literal substrings,
// combined with optional matchers for the text surrounding the literal:
//   - left and right both set: the literal may appear anywhere; left must match
//     the text before it and right the text after it.
//   - only right set: the literal must be a prefix and right must match the rest.
//   - only left set: the literal must be a suffix and left must match the rest.
//
// When neither left nor right is set, nothing matches.
type containsStringMatcher struct {
        // The matcher that must match the left side. Can be nil.
        left StringMatcher

        // At least one of these strings must match in the "middle", between left and right matchers.
        substrings []string

        // The matcher that must match the right side. Can be nil.
        right StringMatcher
}

// Matches reports whether s is accepted using at least one of the substrings.
func (m *containsStringMatcher) Matches(s string) bool {
        hasLeft, hasRight := m.left != nil, m.right != nil

        for _, needle := range m.substrings {
                if hasLeft && hasRight {
                        // Try every occurrence of the literal, because the surrounding
                        // matchers may reject some of them and accept a later one.
                        for from := 0; ; {
                                idx := strings.Index(s[from:], needle)
                                if idx < 0 {
                                        break
                                }

                                // idx is relative to the slice we searched in: translate it
                                // into a position inside s.
                                idx += from

                                if m.left.Matches(s[:idx]) && m.right.Matches(s[idx+len(needle):]) {
                                        return true
                                }

                                // Look for the next occurrence, which may overlap this one.
                                from = idx + 1
                        }
                        continue
                }

                if hasLeft {
                        // Only the left side is constrained, so the literal must be a suffix.
                        if strings.HasSuffix(s, needle) && m.left.Matches(s[:len(s)-len(needle)]) {
                                return true
                        }
                        continue
                }

                if hasRight {
                        // Only the right side is constrained, so the literal must be a prefix.
                        if strings.HasPrefix(s, needle) && m.right.Matches(s[len(needle):]) {
                                return true
                        }
                }
        }

        return false
}

// literalPrefixStringMatcher matches a string which starts with a literal
// prefix and whose remainder is accepted by the right side matcher.
type literalPrefixStringMatcher struct {
        prefix              string
        prefixCaseSensitive bool

        // The matcher that must match the text following the prefix.
        // It must not be nil: Matches calls it unconditionally.
        right StringMatcher
}

// Matches reports whether s starts with the prefix (honouring the configured
// case sensitivity) and the rest of s is accepted by the right matcher.
func (m *literalPrefixStringMatcher) Matches(s string) bool {
        var prefixOK bool
        if m.prefixCaseSensitive {
                prefixOK = strings.HasPrefix(s, m.prefix)
        } else {
                prefixOK = hasPrefixCaseInsensitive(s, m.prefix)
        }
        if !prefixOK {
                return false
        }

        // The prefix matched: delegate what follows it to the right matcher.
        rest := s[len(m.prefix):]
        return m.right.Matches(rest)
}

// literalSuffixStringMatcher matches a string which ends with a literal suffix
// and whose leading part is accepted by the left side matcher.
type literalSuffixStringMatcher struct {
        // The matcher that must match the text preceding the suffix.
        // It must not be nil: Matches calls it unconditionally.
        left StringMatcher

        suffix              string
        suffixCaseSensitive bool
}

// Matches reports whether s ends with the suffix (honouring the configured
// case sensitivity) and the text before it is accepted by the left matcher.
func (m *literalSuffixStringMatcher) Matches(s string) bool {
        var suffixOK bool
        if m.suffixCaseSensitive {
                suffixOK = strings.HasSuffix(s, m.suffix)
        } else {
                suffixOK = hasSuffixCaseInsensitive(s, m.suffix)
        }
        if !suffixOK {
                return false
        }

        // The suffix matched: delegate what precedes it to the left matcher.
        head := s[:len(s)-len(m.suffix)]
        return m.left.Matches(head)
}

// emptyStringMatcher is a StringMatcher which only accepts the empty string.
type emptyStringMatcher struct{}

// Matches reports whether s is the empty string.
func (emptyStringMatcher) Matches(s string) bool {
        return s == ""
}

// orStringMatcher is an alternation of matchers: it accepts a string as soon
// as one of the sub-matchers accepts it.
type orStringMatcher []StringMatcher

// Matches reports whether at least one sub-matcher matches s. Sub-matchers are
// tried in order and the evaluation stops at the first match.
func (m orStringMatcher) Matches(s string) bool {
        for i := range m {
                if m[i].Matches(s) {
                        return true
                }
        }
        return false
}

// equalStringMatcher matches a string which is equal to s, optionally ignoring
// the case.
type equalStringMatcher struct {
        s             string
        caseSensitive bool
}

// Matches reports whether s is equal to the configured string. When the matcher
// is case insensitive, the comparison is done with strings.EqualFold.
func (m *equalStringMatcher) Matches(s string) bool {
        if !m.caseSensitive {
                return strings.EqualFold(m.s, s)
        }
        return s == m.s
}

// multiStringMatcherBuilder is a StringMatcher checking a string against
// multiple values, which are registered one at a time through add.
type multiStringMatcherBuilder interface {
        StringMatcher
        // add registers s as one of the values accepted by the matcher.
        add(s string)
        // setMatches returns the values held by the matcher. Implementations
        // may return nil (the map-based one does so when it holds too many values).
        setMatches() []string
}

// newEqualMultiStringMatcher returns an empty matcher checking a string for
// equality against multiple values. The implementation is picked based on
// estimatedSize, which is also used to pre-size the underlying storage.
func newEqualMultiStringMatcher(caseSensitive bool, estimatedSize int) multiStringMatcherBuilder {
        if estimatedSize >= minEqualMultiStringMatcherMapThreshold {
                return &equalMultiStringMapMatcher{
                        values:        make(map[string]struct{}, estimatedSize),
                        caseSensitive: caseSensitive,
                }
        }

        // Below the threshold it's faster to scan a slice than to look up a map.
        return &equalMultiStringSliceMatcher{
                caseSensitive: caseSensitive,
                values:        make([]string, 0, estimatedSize),
        }
}

// equalMultiStringSliceMatcher matches a string which is equal to one of the
// values stored in a slice. Values are compared one by one.
type equalMultiStringSliceMatcher struct {
        values []string

        caseSensitive bool
}

// add appends s to the list of accepted values.
func (m *equalMultiStringSliceMatcher) add(s string) {
        m.values = append(m.values, s)
}

// setMatches returns the accepted values, in insertion order.
func (m *equalMultiStringSliceMatcher) setMatches() []string {
        return m.values
}

// Matches reports whether s is equal to one of the accepted values. When the
// matcher is case insensitive, values are compared with strings.EqualFold.
func (m *equalMultiStringSliceMatcher) Matches(s string) bool {
        if !m.caseSensitive {
                for _, candidate := range m.values {
                        if strings.EqualFold(s, candidate) {
                                return true
                        }
                }
                return false
        }

        for _, candidate := range m.values {
                if candidate == s {
                        return true
                }
        }
        return false
}

// equalMultiStringMapMatcher matches a string which is equal to one of the
// values stored in a map.
type equalMultiStringMapMatcher struct {
        // values holds the accepted values as map keys. If the matching is case
        // insensitive, keys are stored in their normalised lowercase form.
        values map[string]struct{}

        caseSensitive bool
}

// add registers s as an accepted value. If the matching is case insensitive,
// s is normalised and lowercased before being stored.
func (m *equalMultiStringMapMatcher) add(s string) {
        key := s
        if !m.caseSensitive {
                key = toNormalisedLower(key)
        }

        m.values[key] = struct{}{}
}

// setMatches returns the stored values, in no particular order. It returns nil
// if the number of values has reached maxSetMatches.
func (m *equalMultiStringMapMatcher) setMatches() []string {
        if len(m.values) >= maxSetMatches {
                return nil
        }

        out := make([]string, 0, len(m.values))
        for value := range m.values {
                out = append(out, value)
        }
        return out
}

// Matches reports whether s is one of the accepted values. If the matching is
// case insensitive, s is normalised and lowercased before the lookup.
func (m *equalMultiStringMapMatcher) Matches(s string) bool {
        if m.caseSensitive {
                _, found := m.values[s]
                return found
        }

        _, found := m.values[toNormalisedLower(s)]
        return found
}

// toNormalisedLower returns the lowercase form of s.
//
// Strings made only of ASCII characters are handled by a fast path which
// lowercases the letters A-Z and returns s itself when there's nothing to
// change. As soon as a non-ASCII byte is found, the whole string is instead
// normalised with norm.NFKD and then mapped through unicode.ToLower.
func toNormalisedLower(s string) string {
        // lowered is allocated lazily, on the first uppercase ASCII letter found.
        var lowered []byte

        for i := 0; i < len(s); i++ {
                switch c := s[i]; {
                case c >= utf8.RuneSelf:
                        // Not ASCII: give up on the fast path.
                        return strings.Map(unicode.ToLower, norm.NFKD.String(s))
                case c >= 'A' && c <= 'Z':
                        if lowered == nil {
                                lowered = []byte(s)
                        }
                        lowered[i] = c + ('a' - 'A')
                }
        }

        if lowered != nil {
                return yoloString(lowered)
        }
        return s
}

// anyStringWithoutNewlineMatcher is a StringMatcher which accepts every string
// (the empty one too) provided that it has no newline character in it.
type anyStringWithoutNewlineMatcher struct{}

// Matches reports whether s is free of newline characters.
func (anyStringWithoutNewlineMatcher) Matches(s string) bool {
        // The newline is a single ASCII byte and it never occurs inside a
        // multi-byte UTF-8 sequence, so a plain substring search is enough.
        return !strings.Contains(s, "\n")
}

// anyNonEmptyStringMatcher is a StringMatcher which accepts any string with at
// least one character. Strings containing a newline character are accepted
// only if matchNL is set to true.
type anyNonEmptyStringMatcher struct {
        matchNL bool
}

// Matches reports whether s is non-empty and, unless matchNL is set, free of
// newline characters.
func (m *anyNonEmptyStringMatcher) Matches(s string) bool {
        if s == "" {
                return false
        }

        // The newline is an ASCII character, so looking for the single byte is enough.
        return m.matchNL || strings.IndexByte(s, '\n') < 0
}

// zeroOrOneCharacterStringMatcher is a StringMatcher which accepts the empty
// string and strings made of exactly one character. The newline character is
// accepted only if matchNL is set to true.
type zeroOrOneCharacterStringMatcher struct {
        matchNL bool
}

// Matches reports whether s holds at most one character. A single invalid
// UTF-8 byte counts as one character.
func (m *zeroOrOneCharacterStringMatcher) Matches(s string) bool {
        r, size := utf8.DecodeRuneInString(s)

        // The decoded size is 0 for the empty string and 1 for an invalid byte.
        // If it doesn't cover the whole string, there's more than one character.
        if size != len(s) {
                return false
        }

        // The empty string and a lone invalid byte both decode to RuneError:
        // none of them can be a newline, so there's nothing else to check.
        if r == utf8.RuneError {
                return true
        }

        return m.matchNL || r != '\n'
}

// trueMatcher is a StringMatcher which accepts every string, including the
// empty one.
type trueMatcher struct{}

// Matches always returns true, whatever the input is.
func (trueMatcher) Matches(string) bool {
        return true
}

// optimizeEqualStringMatchers optimizes the specific case of an alternation
// (orStringMatcher) made only of strings checked for equality
// (equalStringMatcher) which all share the same case sensitivity. When at least
// threshold values are found, the alternation is replaced by a single matcher
// built by newEqualMultiStringMatcher. In any other case the input is returned
// as is.
func optimizeEqualStringMatchers(input StringMatcher, threshold int) StringMatcher {
        var (
                numValues     int
                caseSensitive bool
                seenFirst     bool
        )

        // First pass: count the values, and make sure that case sensitive and
        // case insensitive matchers aren't mixed.
        countSameCase := func(em *equalStringMatcher) bool {
                switch {
                case !seenFirst:
                        // The first matcher found decides the case sensitivity of all.
                        caseSensitive, seenFirst = em.caseSensitive, true
                case caseSensitive != em.caseSensitive:
                        return false
                }

                numValues++
                return true
        }

        // Nothing to do if the input isn't a suitable alternation, or if it has
        // too few values for the optimization to be worth it.
        if !findEqualStringMatchers(input, countSameCase) || numValues < threshold {
                return input
        }

        // Second pass: collect all the values. The first pass already validated
        // the input, so the returned value can be ignored here.
        multi := newEqualMultiStringMatcher(caseSensitive, numValues)
        findEqualStringMatchers(input, func(em *equalStringMatcher) bool {
                multi.add(em.s)
                return true
        })

        return multi
}

// findEqualStringMatchers walks the input StringMatcher and invokes callback on
// every equalStringMatcher found, descending into nested alternations. The walk
// stops as soon as callback returns false. It returns true if and only if the
// input is an alternation composed *only* of equalStringMatcher (possibly
// nested in other alternations) and callback never returned false.
func findEqualStringMatchers(input StringMatcher, callback func(matcher *equalStringMatcher) bool) bool {
        alternatives, isOr := input.(orStringMatcher)
        if !isOr {
                return false
        }

        for _, alt := range alternatives {
                if eq, isEqual := alt.(*equalStringMatcher); isEqual {
                        if !callback(eq) {
                                return false
                        }
                        continue
                }

                if _, isNested := alt.(orStringMatcher); isNested {
                        if !findEqualStringMatchers(alt, callback) {
                                return false
                        }
                        continue
                }

                // Any other matcher means the optimization can't be applied,
                // so there's no reason to keep searching.
                return false
        }

        return true
}

// hasPrefixCaseInsensitive reports whether s begins with prefix, ignoring the
// case. The comparison is done between prefix and the first len(prefix) bytes
// of s, using strings.EqualFold.
func hasPrefixCaseInsensitive(s, prefix string) bool {
        n := len(prefix)
        if len(s) < n {
                return false
        }
        return strings.EqualFold(s[:n], prefix)
}

// hasSuffixCaseInsensitive reports whether s ends with suffix, ignoring the
// case. The comparison is done between suffix and the last len(suffix) bytes
// of s, using strings.EqualFold.
func hasSuffixCaseInsensitive(s, suffix string) bool {
        start := len(s) - len(suffix)
        if start < 0 {
                return false
        }
        return strings.EqualFold(s[start:], suffix)
}

// containsInOrder reports whether s contains all the given substrings, in the
// given order and without overlaps.
func containsInOrder(s string, contains []string) bool {
        if len(contains) != 1 {
                return containsInOrderMulti(s, contains)
        }

        // Fast path for the case there's only 1 substring to look for.
        return strings.Contains(s, contains[0])
}

// containsInOrderMulti reports whether s contains all the given substrings, in
// the given order. Each substring is searched in the text which follows the
// previous match, so matches can't overlap. It returns true if contains is
// empty.
func containsInOrderMulti(s string, contains []string) bool {
        remaining := s

        for _, needle := range contains {
                idx := strings.Index(remaining, needle)
                if idx < 0 {
                        return false
                }

                // Keep only what follows the match for the next substring.
                remaining = remaining[idx+len(needle):]
        }

        return true
}

// Copyright 2020 The Prometheus Authors
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

//go:build !stringlabels && !dedupelabels

package labels

import (
        "github.com/cespare/xxhash/v2"
)

// StableHash is a labels hashing implementation which is guaranteed to not change over time.
// This function should be used whenever labels hashing backward compatibility must be guaranteed.
//
// The hashed content is each label name and value, in order, each one followed
// by the separator byte.
func StableHash(ls Labels) uint64 {
        // Fast path: serialise the labels into a fixed-capacity buffer and hash
        // it in one shot with xxhash.Sum64(), which is faster.
        buf := make([]byte, 0, 1024)

        for idx, lbl := range ls {
                // +2 accounts for the two separators written along with the label.
                needed := len(lbl.Name) + len(lbl.Value) + 2
                if len(buf)+needed >= cap(buf) {
                        // The buffer is too small to hold this label too. Instead of
                        // growing it, feed what's been buffered so far and all the
                        // remaining labels to a streaming hasher.
                        digest := xxhash.New()
                        _, _ = digest.Write(buf)
                        for _, rest := range ls[idx:] {
                                _, _ = digest.WriteString(rest.Name)
                                _, _ = digest.Write(seps)
                                _, _ = digest.WriteString(rest.Value)
                                _, _ = digest.Write(seps)
                        }
                        return digest.Sum64()
                }

                buf = append(buf, lbl.Name...)
                buf = append(buf, seps[0])
                buf = append(buf, lbl.Value...)
                buf = append(buf, seps[0])
        }

        return xxhash.Sum64(buf)
}

// Copyright 2017 The Prometheus Authors
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package labels

import (
        "bufio"
        "fmt"
        "os"
        "strings"
)

// Slice is a slice of label sets which can be sorted: its Len, Swap and Less
// methods have the signatures required by sort.Interface.
type Slice []Labels

// Len returns the number of label sets in the slice.
func (s Slice) Len() int {
        return len(s)
}

// Swap exchanges the label sets at positions i and j.
func (s Slice) Swap(i, j int) {
        tmp := s[i]
        s[i] = s[j]
        s[j] = tmp
}

// Less reports whether the label set at position i sorts before the one at
// position j, according to Compare.
func (s Slice) Less(i, j int) bool {
        return Compare(s[i], s[j]) < 0
}

// Selector is a list of matchers which a label set is checked against.
type Selector []*Matcher

// Matches returns whether the labels satisfy all matchers. Each matcher is
// evaluated against the value returned by labels.Get for the matcher's label
// name. An empty selector matches any label set.
func (s Selector) Matches(labels Labels) bool {
        for i := range s {
                matcher := s[i]
                value := labels.Get(matcher.Name)
                if !matcher.Matches(value) {
                        return false
                }
        }
        return true
}

// ReadLabels reads up to n label sets in a JSON formatted file fn. It is mostly useful
// to load testing data.
//
// The file is read line by line and each line is expected to hold one label
// set. Parsing is simplistic: double quotes and curly braces are stripped, the
// line is split on "," into labels, and each label is split on ":" into name
// and value (anything following a second ":" is ignored). Label sets whose hash
// has already been seen are skipped and don't count towards n.
//
// It returns an error if the file can't be opened or read, or if a label has
// no ":" separator. If fewer than n distinct label sets are found, the ones
// found are returned along with an error.
func ReadLabels(fn string, n int) ([]Labels, error) {
        f, err := os.Open(fn)
        if err != nil {
                return nil, err
        }
        defer f.Close()

        scanner := bufio.NewScanner(f)
        b := NewScratchBuilder(0)

        // The replacer doesn't depend on the line, so it's built only once.
        r := strings.NewReplacer("\"", "", "{", "", "}", "")

        var mets []Labels
        hashes := map[uint64]struct{}{}
        i := 0
        lineNum := 0

        // Check the count first, to not read a line which isn't needed.
        for i < n && scanner.Scan() {
                lineNum++
                b.Reset()

                s := r.Replace(scanner.Text())

                labelChunks := strings.Split(s, ",")
                for _, labelChunk := range labelChunks {
                        split := strings.Split(labelChunk, ":")
                        if len(split) < 2 {
                                // Report malformed input (e.g. a blank line) instead of
                                // panicking on the missing value.
                                return nil, fmt.Errorf("%s:%d: malformed label %q, expected name:value", fn, lineNum, labelChunk)
                        }
                        b.Add(split[0], split[1])
                }
                // Order of the k/v labels matters, don't assume we'll always receive them already sorted.
                b.Sort()
                m := b.Labels()

                h := m.Hash()
                if _, ok := hashes[h]; ok {
                        continue
                }
                mets = append(mets, m)
                hashes[h] = struct{}{}
                i++
        }

        // Scan() also returns false on read errors (including too long lines):
        // don't mistake them for the end of the file.
        if err := scanner.Err(); err != nil {
                return nil, fmt.Errorf("reading %s: %w", fn, err)
        }

        if i != n {
                return mets, fmt.Errorf("requested %d metrics but found %d", n, i)
        }
        return mets, nil
}

// Copyright 2015 The Prometheus Authors
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package relabel

import (
        "crypto/md5"
        "encoding/binary"
        "fmt"
        "strconv"
        "strings"

        "github.com/grafana/regexp"
        "github.com/prometheus/common/model"

        "github.com/prometheus/prometheus/model/labels"
)

var (
        // relabelTarget is the pattern used by Config.Validate to check values
        // which may embed "$name" or "${name}" style references: the
        // 'target_label' of a replace action when it contains a "$", and the
        // 'replacement' of a labelmap action.
        relabelTarget = regexp.MustCompile(`^(?:(?:[a-zA-Z_]|\$(?:\{\w+\}|\w+))+\w*)+$`)

        // DefaultRelabelConfig is the default relabel configuration. It's the
        // starting point Config.UnmarshalYAML decodes into, so any field not
        // set in the YAML keeps the value defined here.
        DefaultRelabelConfig = Config{
                Action:      Replace,
                Separator:   ";",
                Regex:       MustNewRegexp("(.*)"),
                Replacement: "$1",
        }
)

// Action is the action to be performed on relabeling. When unmarshalled from
// YAML the value is lowercased first, so the action names below are accepted
// regardless of their case.
type Action string

const (
        // Replace performs a regex replacement.
        Replace Action = "replace"
        // Keep drops targets for which the input does not match the regex.
        Keep Action = "keep"
        // Drop drops targets for which the input does match the regex.
        Drop Action = "drop"
        // KeepEqual drops targets for which the input does not match the target.
        KeepEqual Action = "keepequal"
        // DropEqual drops targets for which the input does match the target.
        DropEqual Action = "dropequal"
        // HashMod sets a label to the modulus of a hash of labels.
        HashMod Action = "hashmod"
        // LabelMap copies labels to other labelnames based on a regex.
        LabelMap Action = "labelmap"
        // LabelDrop drops any label matching the regex.
        LabelDrop Action = "labeldrop"
        // LabelKeep drops any label not matching the regex.
        LabelKeep Action = "labelkeep"
        // Lowercase maps input letters to their lower case.
        Lowercase Action = "lowercase"
        // Uppercase maps input letters to their upper case.
        Uppercase Action = "uppercase"
)

// UnmarshalYAML implements the yaml.Unmarshaler interface. The action name is
// matched case-insensitively and stored in its lowercase form. An error is
// returned if the name is not one of the known actions.
func (a *Action) UnmarshalYAML(unmarshal func(interface{}) error) error {
        var raw string
        if err := unmarshal(&raw); err != nil {
                return err
        }

        act := Action(strings.ToLower(raw))
        switch act {
        case Replace, Keep, Drop, KeepEqual, DropEqual, HashMod, LabelMap, LabelDrop, LabelKeep, Lowercase, Uppercase:
                *a = act
                return nil
        default:
                // Report the name as written by the user, not the lowercased one.
                return fmt.Errorf("unknown relabel action %q", raw)
        }
}

// Config is the configuration for relabeling of target label sets.
// Fields not set when unmarshalling from YAML take their value from
// DefaultRelabelConfig.
type Config struct {
        // A list of labels from which values are taken and concatenated
        // with the configured separator in order.
        SourceLabels model.LabelNames `yaml:"source_labels,flow,omitempty"`
        // Separator is the string between concatenated values from the source labels.
        // Defaults to ";".
        Separator string `yaml:"separator,omitempty"`
        // Regex against which the concatenation is matched.
        // Defaults to "(.*)".
        Regex Regexp `yaml:"regex,omitempty"`
        // Modulus to take of the hash of concatenated values from the source labels.
        // Must be non-zero for the hashmod action.
        Modulus uint64 `yaml:"modulus,omitempty"`
        // TargetLabel is the label to which the resulting string is written in a replacement.
        // Regexp interpolation is allowed for the replace action.
        // Required by the replace, hashmod, lowercase, uppercase, keepequal and
        // dropequal actions.
        TargetLabel string `yaml:"target_label,omitempty"`
        // Replacement is the regex replacement pattern to be used.
        // Defaults to "$1". It can't be changed for the lowercase, uppercase,
        // keepequal and dropequal actions.
        Replacement string `yaml:"replacement,omitempty"`
        // Action is the action to be performed for the relabeling.
        // Defaults to replace.
        Action Action `yaml:"action,omitempty"`
}

// UnmarshalYAML implements the yaml.Unmarshaler interface. The configuration is
// decoded on top of DefaultRelabelConfig and then validated.
func (c *Config) UnmarshalYAML(unmarshal func(interface{}) error) error {
        // Start from the defaults, so that fields missing in the YAML keep them.
        *c = DefaultRelabelConfig

        // plain has the same fields as Config but none of its methods, which
        // keeps the decoding from calling this method again.
        type plain Config
        err := unmarshal((*plain)(c))
        if err != nil {
                return err
        }

        // Make sure there's always a compiled regex to work with.
        if c.Regex.Regexp == nil {
                c.Regex = MustNewRegexp("")
        }

        return c.Validate()
}

// Validate checks that the fields of the Config are consistent with its
// Action. It returns an error describing the first violated constraint, or
// nil if the configuration is acceptable.
func (c *Config) Validate() error {
        if c.Action == "" {
                return fmt.Errorf("relabel action cannot be empty")
        }

        switch c.Action {
        case HashMod:
                if c.Modulus == 0 {
                        return fmt.Errorf("relabel configuration for hashmod requires non-zero modulus")
                }
                if c.TargetLabel == "" {
                        return fmt.Errorf("relabel configuration for %s action requires 'target_label' value", c.Action)
                }
                if !model.LabelName(c.TargetLabel).IsValid() {
                        return fmt.Errorf("%q is invalid 'target_label' for %s action", c.TargetLabel, c.Action)
                }

        case Replace:
                if c.TargetLabel == "" {
                        return fmt.Errorf("relabel configuration for %s action requires 'target_label' value", c.Action)
                }
                // A target containing '$' is a template that is expanded per match, so it
                // is checked against the template pattern rather than as a label name.
                var targetOK bool
                if strings.Contains(c.TargetLabel, "$") {
                        targetOK = relabelTarget.MatchString(c.TargetLabel)
                } else {
                        targetOK = model.LabelName(c.TargetLabel).IsValid()
                }
                if !targetOK {
                        return fmt.Errorf("%q is invalid 'target_label' for %s action", c.TargetLabel, c.Action)
                }

        case Lowercase, Uppercase, KeepEqual, DropEqual:
                if c.TargetLabel == "" {
                        return fmt.Errorf("relabel configuration for %s action requires 'target_label' value", c.Action)
                }
                if !model.LabelName(c.TargetLabel).IsValid() {
                        return fmt.Errorf("%q is invalid 'target_label' for %s action", c.TargetLabel, c.Action)
                }
                if c.Replacement != DefaultRelabelConfig.Replacement {
                        return fmt.Errorf("'replacement' can not be set for %s action", c.Action)
                }
                if c.Action == DropEqual || c.Action == KeepEqual {
                        onlyDefaults := c.Regex == DefaultRelabelConfig.Regex &&
                                c.Modulus == DefaultRelabelConfig.Modulus &&
                                c.Separator == DefaultRelabelConfig.Separator &&
                                c.Replacement == DefaultRelabelConfig.Replacement
                        if !onlyDefaults {
                                return fmt.Errorf("%s action requires only 'source_labels' and `target_label`, and no other fields", c.Action)
                        }
                }

        case LabelMap:
                if !relabelTarget.MatchString(c.Replacement) {
                        return fmt.Errorf("%q is invalid 'replacement' for %s action", c.Replacement, c.Action)
                }

        case LabelDrop, LabelKeep:
                onlyRegex := c.SourceLabels == nil &&
                        c.TargetLabel == DefaultRelabelConfig.TargetLabel &&
                        c.Modulus == DefaultRelabelConfig.Modulus &&
                        c.Separator == DefaultRelabelConfig.Separator &&
                        c.Replacement == DefaultRelabelConfig.Replacement
                if !onlyRegex {
                        return fmt.Errorf("%s action requires only 'regex', and no other fields", c.Action)
                }
        }

        return nil
}

// Regexp encapsulates a regexp.Regexp and makes it YAML marshalable.
// Values created through NewRegexp are anchored: the pattern is wrapped in
// `^(?:` and `)$` before compilation.
type Regexp struct {
        *regexp.Regexp
}

// NewRegexp wraps s in `^(?:` and `)$` and compiles the result, yielding an
// anchored Regexp. If the expression does not compile, the compilation error
// is returned.
func NewRegexp(s string) (Regexp, error) {
        anchored := "^(?:" + s + ")$"
        compiled, err := regexp.Compile(anchored)
        return Regexp{Regexp: compiled}, err
}

// MustNewRegexp is the panicking variant of NewRegexp: it returns the compiled
// Regexp and panics with the compilation error if s is not a valid expression.
func MustNewRegexp(s string) Regexp {
        compiled, compileErr := NewRegexp(s)
        if compileErr == nil {
                return compiled
        }
        panic(compileErr)
}

// UnmarshalYAML implements the yaml.Unmarshaler interface.
// The YAML value is read as a string and compiled with NewRegexp; the
// receiver is only overwritten when compilation succeeds.
func (re *Regexp) UnmarshalYAML(unmarshal func(interface{}) error) error {
        var pattern string
        err := unmarshal(&pattern)
        if err != nil {
                return err
        }

        compiled, err := NewRegexp(pattern)
        if err == nil {
                *re = compiled
        }
        return err
}

// MarshalYAML implements the yaml.Marshaler interface.
// It emits the unanchored pattern string, or nil when that string is empty.
func (re Regexp) MarshalYAML() (interface{}, error) {
        pattern := re.String()
        if pattern == "" {
                return nil, nil
        }
        return pattern, nil
}

// IsZero implements the yaml.IsZeroer interface.
// A Regexp counts as zero when it wraps the very same compiled expression
// (pointer equality) as the regex of DefaultRelabelConfig.
func (re Regexp) IsZero() bool {
        defaultRegex := DefaultRelabelConfig.Regex
        return defaultRegex.Regexp == re.Regexp
}

// String returns the original string used to compile the regular expression,
// i.e. the pattern without the anchors added by NewRegexp.
//
// A Regexp that wraps no compiled expression (for example the zero value)
// yields the empty string instead of panicking on a nil dereference. This
// also makes MarshalYAML safe to call on such values.
func (re Regexp) String() string {
        if re.Regexp == nil {
                return ""
        }
        str := re.Regexp.String()
        // Trim the anchor `^(?:` prefix and `)$` suffix.
        return str[4 : len(str)-2]
}

// Process applies the given relabel configurations, in the order passed, to
// lbls and returns the resulting label set.
// There are circumstances where Process will modify the input label.
// If you want to avoid issues with the input label set being modified, at the cost of
// higher memory usage, you can use lbls.Copy().
// When a rule drops the label set, EmptyLabels and false are returned.
func Process(lbls labels.Labels, cfgs ...*Config) (ret labels.Labels, keep bool) {
        builder := labels.NewBuilder(lbls)
        if ProcessBuilder(builder, cfgs...) {
                return builder.Labels(), true
        }
        return labels.EmptyLabels(), false
}

// ProcessBuilder is the labels.Builder based variant of Process: the caller
// supplies a builder holding the initial labels and the rules mutate it in
// place. Rules run in order and processing stops at the first rule that drops
// the label set, in which case false is returned.
func ProcessBuilder(lb *labels.Builder, cfgs ...*Config) (keep bool) {
        for i := range cfgs {
                if !relabel(cfgs[i], lb) {
                        return false
                }
        }
        return true
}

// relabel applies the single relabeling rule cfg to the labels held in lb.
// It returns false when the rule decides that the label set must be dropped
// (drop, keep, dropequal and keepequal actions) and true in every other case.
// It panics if cfg.Action is not one of the handled actions.
func relabel(cfg *Config, lb *labels.Builder) (keep bool) {
        // va backs the values slice for up to 16 source labels; a separate slice
        // is only allocated when more source labels are configured (presumably
        // to avoid an allocation in the common case).
        var va [16]string
        values := va[:0]
        if len(cfg.SourceLabels) > cap(values) {
                values = make([]string, 0, len(cfg.SourceLabels))
        }
        for _, ln := range cfg.SourceLabels {
                values = append(values, lb.Get(string(ln)))
        }
        // val is the concatenation of all source label values, in order,
        // joined by the configured separator.
        val := strings.Join(values, cfg.Separator)

        switch cfg.Action {
        case Drop:
                // Drop the label set if the concatenated value matches the regex.
                if cfg.Regex.MatchString(val) {
                        return false
                }
        case Keep:
                // Drop the label set unless the concatenated value matches the regex.
                if !cfg.Regex.MatchString(val) {
                        return false
                }
        case DropEqual:
                // Drop the label set if the target label equals the concatenated value.
                if lb.Get(cfg.TargetLabel) == val {
                        return false
                }
        case KeepEqual:
                // Drop the label set unless the target label equals the concatenated value.
                if lb.Get(cfg.TargetLabel) != val {
                        return false
                }
        case Replace:
                indexes := cfg.Regex.FindStringSubmatchIndex(val)
                // If there is no match no replacement must take place.
                if indexes == nil {
                        break
                }
                // The target label name may reference capture groups of the match.
                target := model.LabelName(cfg.Regex.ExpandString([]byte{}, cfg.TargetLabel, val, indexes))
                // An expanded target that is not a valid label name leaves the labels untouched.
                if !target.IsValid() {
                        break
                }
                res := cfg.Regex.ExpandString([]byte{}, cfg.Replacement, val, indexes)
                // An empty replacement result removes the target label instead of setting it.
                if len(res) == 0 {
                        lb.Del(string(target))
                        break
                }
                lb.Set(string(target), string(res))
        case Lowercase:
                lb.Set(cfg.TargetLabel, strings.ToLower(val))
        case Uppercase:
                lb.Set(cfg.TargetLabel, strings.ToUpper(val))
        case HashMod:
                hash := md5.Sum([]byte(val))
                // Use only the last 8 bytes of the hash to give the same result as earlier versions of this code.
                mod := binary.BigEndian.Uint64(hash[8:]) % cfg.Modulus
                lb.Set(cfg.TargetLabel, strconv.FormatUint(mod, 10))
        case LabelMap:
                // For every label whose name matches the regex, set a label named after
                // the replacement pattern applied to that name, carrying the same value.
                lb.Range(func(l labels.Label) {
                        if cfg.Regex.MatchString(l.Name) {
                                res := cfg.Regex.ReplaceAllString(l.Name, cfg.Replacement)
                                lb.Set(res, l.Value)
                        }
                })
        case LabelDrop:
                // Remove every label whose name matches the regex.
                lb.Range(func(l labels.Label) {
                        if cfg.Regex.MatchString(l.Name) {
                                lb.Del(l.Name)
                        }
                })
        case LabelKeep:
                // Remove every label whose name does not match the regex.
                lb.Range(func(l labels.Label) {
                        if !cfg.Regex.MatchString(l.Name) {
                                lb.Del(l.Name)
                        }
                })
        default:
                panic(fmt.Errorf("relabel: unknown relabel action type %q", cfg.Action))
        }

        return true
}

// Copyright 2018 The Prometheus Authors
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package textparse

import (
        "mime"

        "github.com/prometheus/common/model"

        "github.com/prometheus/prometheus/model/exemplar"
        "github.com/prometheus/prometheus/model/histogram"
        "github.com/prometheus/prometheus/model/labels"
)

// Parser parses samples from a byte slice of samples in the official
// Prometheus and OpenMetrics text exposition formats.
//
// Callers advance through the input with Next and then use the accessor
// that corresponds to the returned Entry (for example Help after a help
// entry or Type after a type entry).
type Parser interface {
        // Series returns the bytes of a series with a simple float64 as a
        // value, the timestamp if set, and the value of the current sample.
        Series() ([]byte, *int64, float64)

        // Histogram returns the bytes of a series with a sparse histogram as a
        // value, the timestamp if set, and the histogram in the current sample.
        // Depending on the parsed input, the function returns an (integer) Histogram
        // or a FloatHistogram, with the respective other return value being nil.
        Histogram() ([]byte, *int64, *histogram.Histogram, *histogram.FloatHistogram)

        // Help returns the metric name and help text in the current entry.
        // Must only be called after Next returned a help entry.
        // The returned byte slices become invalid after the next call to Next.
        Help() ([]byte, []byte)

        // Type returns the metric name and type in the current entry.
        // Must only be called after Next returned a type entry.
        // The returned byte slices become invalid after the next call to Next.
        Type() ([]byte, model.MetricType)

        // Unit returns the metric name and unit in the current entry.
        // Must only be called after Next returned a unit entry.
        // The returned byte slices become invalid after the next call to Next.
        Unit() ([]byte, []byte)

        // Comment returns the text of the current comment.
        // Must only be called after Next returned a comment entry.
        // The returned byte slice becomes invalid after the next call to Next.
        Comment() []byte

        // Metric writes the labels of the current sample into the passed labels.
        // It returns the string from which the metric was parsed.
        Metric(l *labels.Labels) string

        // Exemplar writes the exemplar of the current sample into the passed
        // exemplar. It can be called repeatedly to retrieve multiple exemplars
        // for the same sample. It returns false once all exemplars are
        // retrieved (including the case where no exemplars exist at all).
        Exemplar(l *exemplar.Exemplar) bool

        // CreatedTimestamp returns the created timestamp (in milliseconds) for the
        // current sample. It returns nil if it is unknown e.g. if it wasn't set,
        // if the scrape protocol or metric type does not support created timestamps.
        CreatedTimestamp() *int64

        // Next advances the parser to the next sample.
        // It returns (EntryInvalid, io.EOF) if no samples were read.
        Next() (Entry, error)
}

// New returns a new parser of the byte slice, chosen by the media type found
// in contentType: an OpenMetrics parser for "application/openmetrics-text", a
// protobuf parser for "application/vnd.google.protobuf", and the Prometheus
// text parser for anything else, including an empty content type.
//
// This function always returns a valid parser, but might additionally
// return an error if the content type cannot be parsed.
func New(b []byte, contentType string, parseClassicHistograms bool, st *labels.SymbolTable) (Parser, error) {
        if contentType != "" {
                mediaType, _, err := mime.ParseMediaType(contentType)
                if err != nil {
                        // Fall back to the Prometheus text parser, but still surface the error.
                        return NewPromParser(b, st), err
                }
                if mediaType == "application/openmetrics-text" {
                        return NewOpenMetricsParser(b, st), nil
                }
                if mediaType == "application/vnd.google.protobuf" {
                        return NewProtobufParser(b, parseClassicHistograms, st), nil
                }
        }
        return NewPromParser(b, st), nil
}

// Entry represents the type of a parsed entry.
type Entry int

// The entry kinds a parser can report. The numeric values run from -1
// (EntryInvalid) to 5 (EntryHistogram) in declaration order.
const (
        EntryInvalid   Entry = iota - 1 // -1
        EntryType                       // 0
        EntryHelp                       // 1
        EntrySeries                     // 2: a series with a simple float64 as value.
        EntryComment                    // 3
        EntryUnit                       // 4
        EntryHistogram                  // 5: a series with a native histogram as a value.
)

// Code generated by golex. DO NOT EDIT.

// Copyright 2018 The Prometheus Authors
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package textparse

import (
        "fmt"
)

// Lex is called by the parser generated by "go tool yacc" to obtain each
// token. The method is opened before the matching rules block and closed at
// the end of the file.
//
// It returns tEOF when the read position is already at or past the end of the
// input and tInvalid when no rule matches. On entry l.start is set to the
// current read position, which is what buf() uses as the start of the token.
func (l *openMetricsLexer) Lex() token {
        if l.i >= len(l.b) {
                return tEOF
        }
        c := l.b[l.i]
        l.start = l.i

yystate0:

        // Jump to the entry state of the lexer's current start condition.
        switch yyt := l.state; yyt {
        default:
                panic(fmt.Errorf(`invalid start condition %d`, yyt))
        case 0: // start condition: INITIAL
                goto yystart1
        case 1: // start condition: sComment
                goto yystart6
        case 2: // start condition: sMeta1
                goto yystart26
        case 3: // start condition: sMeta2
                goto yystart31
        case 4: // start condition: sLabels
                goto yystart34
        case 5: // start condition: sLValue
                goto yystart42
        case 6: // start condition: sValue
                goto yystart46
        case 7: // start condition: sTimestamp
                goto yystart50
        case 8: // start condition: sExemplar
                goto yystart57
        case 9: // start condition: sEValue
                goto yystart62
        case 10: // start condition: sETimestamp
                goto yystart68
        }

yystate1:
        c = l.next()
yystart1:
        switch {
        default:
                goto yyabort
        case c == '#':
                goto yystate2
        case c == ':' || c >= 'A' && c <= 'Z' || c == '_' || c >= 'a' && c <= 'z':
                goto yystate4
        case c == '{':
                goto yystate5
        }

yystate2:
        c = l.next()
        switch {
        default:
                goto yyabort
        case c == ' ':
                goto yystate3
        }

yystate3:
        c = l.next()
        goto yyrule1

yystate4:
        c = l.next()
        switch {
        default:
                goto yyrule9
        case c >= '0' && c <= ':' || c >= 'A' && c <= 'Z' || c == '_' || c >= 'a' && c <= 'z':
                goto yystate4
        }

yystate5:
        c = l.next()
        goto yyrule11

yystate6:
        c = l.next()
yystart6:
        switch {
        default:
                goto yyabort
        case c == 'E':
                goto yystate7
        case c == 'H':
                goto yystate11
        case c == 'T':
                goto yystate16
        case c == 'U':
                goto yystate21
        }

yystate7:
        c = l.next()
        switch {
        default:
                goto yyabort
        case c == 'O':
                goto yystate8
        }

yystate8:
        c = l.next()
        switch {
        default:
                goto yyabort
        case c == 'F':
                goto yystate9
        }

yystate9:
        c = l.next()
        switch {
        default:
                goto yyrule5
        case c == '\n':
                goto yystate10
        }

yystate10:
        c = l.next()
        goto yyrule5

yystate11:
        c = l.next()
        switch {
        default:
                goto yyabort
        case c == 'E':
                goto yystate12
        }

yystate12:
        c = l.next()
        switch {
        default:
                goto yyabort
        case c == 'L':
                goto yystate13
        }

yystate13:
        c = l.next()
        switch {
        default:
                goto yyabort
        case c == 'P':
                goto yystate14
        }

yystate14:
        c = l.next()
        switch {
        default:
                goto yyabort
        case c == ' ':
                goto yystate15
        }

yystate15:
        c = l.next()
        goto yyrule2

yystate16:
        c = l.next()
        switch {
        default:
                goto yyabort
        case c == 'Y':
                goto yystate17
        }

yystate17:
        c = l.next()
        switch {
        default:
                goto yyabort
        case c == 'P':
                goto yystate18
        }

yystate18:
        c = l.next()
        switch {
        default:
                goto yyabort
        case c == 'E':
                goto yystate19
        }

yystate19:
        c = l.next()
        switch {
        default:
                goto yyabort
        case c == ' ':
                goto yystate20
        }

yystate20:
        c = l.next()
        goto yyrule3

yystate21:
        c = l.next()
        switch {
        default:
                goto yyabort
        case c == 'N':
                goto yystate22
        }

yystate22:
        c = l.next()
        switch {
        default:
                goto yyabort
        case c == 'I':
                goto yystate23
        }

yystate23:
        c = l.next()
        switch {
        default:
                goto yyabort
        case c == 'T':
                goto yystate24
        }

yystate24:
        c = l.next()
        switch {
        default:
                goto yyabort
        case c == ' ':
                goto yystate25
        }

yystate25:
        c = l.next()
        goto yyrule4

yystate26:
        c = l.next()
yystart26:
        switch {
        default:
                goto yyabort
        case c == '"':
                goto yystate27
        case c == ':' || c >= 'A' && c <= 'Z' || c == '_' || c >= 'a' && c <= 'z':
                goto yystate30
        }

yystate27:
        c = l.next()
        switch {
        default:
                goto yyabort
        case c == '"':
                goto yystate28
        case c == '\\':
                goto yystate29
        case c >= '\x01' && c <= '!' || c >= '#' && c <= '[' || c >= ']' && c <= 'ÿ':
                goto yystate27
        }

yystate28:
        c = l.next()
        goto yyrule6

yystate29:
        c = l.next()
        switch {
        default:
                goto yyabort
        case c >= '\x01' && c <= '\t' || c >= '\v' && c <= 'ÿ':
                goto yystate27
        }

yystate30:
        c = l.next()
        switch {
        default:
                goto yyrule7
        case c >= '0' && c <= ':' || c >= 'A' && c <= 'Z' || c == '_' || c >= 'a' && c <= 'z':
                goto yystate30
        }

yystate31:
        c = l.next()
yystart31:
        switch {
        default:
                goto yyabort
        case c == ' ':
                goto yystate32
        }

yystate32:
        c = l.next()
        switch {
        default:
                goto yyabort
        case c == '\n':
                goto yystate33
        case c >= '\x01' && c <= '\t' || c >= '\v' && c <= 'ÿ':
                goto yystate32
        }

yystate33:
        c = l.next()
        goto yyrule8

yystate34:
        c = l.next()
yystart34:
        switch {
        default:
                goto yyabort
        case c == '"':
                goto yystate35
        case c == ',':
                goto yystate38
        case c == '=':
                goto yystate39
        case c == '}':
                goto yystate41
        case c >= 'A' && c <= 'Z' || c == '_' || c >= 'a' && c <= 'z':
                goto yystate40
        }

yystate35:
        c = l.next()
        switch {
        default:
                goto yyabort
        case c == '"':
                goto yystate36
        case c == '\\':
                goto yystate37
        case c >= '\x01' && c <= '!' || c >= '#' && c <= '[' || c >= ']' && c <= 'ÿ':
                goto yystate35
        }

yystate36:
        c = l.next()
        goto yyrule13

yystate37:
        c = l.next()
        switch {
        default:
                goto yyabort
        case c >= '\x01' && c <= '\t' || c >= '\v' && c <= 'ÿ':
                goto yystate35
        }

yystate38:
        c = l.next()
        goto yyrule16

yystate39:
        c = l.next()
        goto yyrule15

yystate40:
        c = l.next()
        switch {
        default:
                goto yyrule12
        case c >= '0' && c <= '9' || c >= 'A' && c <= 'Z' || c == '_' || c >= 'a' && c <= 'z':
                goto yystate40
        }

yystate41:
        c = l.next()
        goto yyrule14

yystate42:
        c = l.next()
yystart42:
        switch {
        default:
                goto yyabort
        case c == '"':
                goto yystate43
        }

yystate43:
        c = l.next()
        switch {
        default:
                goto yyabort
        case c == '"':
                goto yystate44
        case c == '\\':
                goto yystate45
        case c >= '\x01' && c <= '\t' || c >= '\v' && c <= '!' || c >= '#' && c <= '[' || c >= ']' && c <= 'ÿ':
                goto yystate43
        }

yystate44:
        c = l.next()
        goto yyrule17

yystate45:
        c = l.next()
        switch {
        default:
                goto yyabort
        case c >= '\x01' && c <= '\t' || c >= '\v' && c <= 'ÿ':
                goto yystate43
        }

yystate46:
        c = l.next()
yystart46:
        switch {
        default:
                goto yyabort
        case c == ' ':
                goto yystate47
        case c == '{':
                goto yystate49
        }

yystate47:
        c = l.next()
        switch {
        default:
                goto yyabort
        case c >= '\x01' && c <= '\t' || c >= '\v' && c <= '\x1f' || c >= '!' && c <= 'ÿ':
                goto yystate48
        }

yystate48:
        c = l.next()
        switch {
        default:
                goto yyrule18
        case c >= '\x01' && c <= '\t' || c >= '\v' && c <= '\x1f' || c >= '!' && c <= 'ÿ':
                goto yystate48
        }

yystate49:
        c = l.next()
        goto yyrule10

yystate50:
        c = l.next()
yystart50:
        switch {
        default:
                goto yyabort
        case c == ' ':
                goto yystate52
        case c == '\n':
                goto yystate51
        }

yystate51:
        c = l.next()
        goto yyrule20

yystate52:
        c = l.next()
        switch {
        default:
                goto yyabort
        case c == '#':
                goto yystate54
        case c >= '\x01' && c <= '\t' || c >= '\v' && c <= '\x1f' || c == '!' || c == '"' || c >= '$' && c <= 'ÿ':
                goto yystate53
        }

yystate53:
        c = l.next()
        switch {
        default:
                goto yyrule19
        case c >= '\x01' && c <= '\t' || c >= '\v' && c <= '\x1f' || c >= '!' && c <= 'ÿ':
                goto yystate53
        }

yystate54:
        c = l.next()
        switch {
        default:
                goto yyrule19
        case c == ' ':
                goto yystate55
        case c >= '\x01' && c <= '\t' || c >= '\v' && c <= '\x1f' || c >= '!' && c <= 'ÿ':
                goto yystate53
        }

yystate55:
        c = l.next()
        switch {
        default:
                goto yyabort
        case c == '{':
                goto yystate56
        }

yystate56:
        c = l.next()
        goto yyrule21

yystate57:
        c = l.next()
yystart57:
        switch {
        default:
                goto yyabort
        case c == ',':
                goto yystate58
        case c == '=':
                goto yystate59
        case c == '}':
                goto yystate61
        case c >= 'A' && c <= 'Z' || c == '_' || c >= 'a' && c <= 'z':
                goto yystate60
        }

yystate58:
        c = l.next()
        goto yyrule26

yystate59:
        c = l.next()
        goto yyrule24

yystate60:
        c = l.next()
        switch {
        default:
                goto yyrule22
        case c >= '0' && c <= '9' || c >= 'A' && c <= 'Z' || c == '_' || c >= 'a' && c <= 'z':
                goto yystate60
        }

yystate61:
        c = l.next()
        goto yyrule23

yystate62:
        c = l.next()
yystart62:
        switch {
        default:
                goto yyabort
        case c == ' ':
                goto yystate63
        case c == '"':
                goto yystate65
        }

yystate63:
        c = l.next()
        switch {
        default:
                goto yyabort
        case c >= '\x01' && c <= '\t' || c >= '\v' && c <= '\x1f' || c >= '!' && c <= 'ÿ':
                goto yystate64
        }

yystate64:
        c = l.next()
        switch {
        default:
                goto yyrule27
        case c >= '\x01' && c <= '\t' || c >= '\v' && c <= '\x1f' || c >= '!' && c <= 'ÿ':
                goto yystate64
        }

yystate65:
        c = l.next()
        switch {
        default:
                goto yyabort
        case c == '"':
                goto yystate66
        case c == '\\':
                goto yystate67
        case c >= '\x01' && c <= '\t' || c >= '\v' && c <= '!' || c >= '#' && c <= '[' || c >= ']' && c <= 'ÿ':
                goto yystate65
        }

yystate66:
        c = l.next()
        goto yyrule25

yystate67:
        c = l.next()
        switch {
        default:
                goto yyabort
        case c >= '\x01' && c <= '\t' || c >= '\v' && c <= 'ÿ':
                goto yystate65
        }

yystate68:
        c = l.next()
yystart68:
        switch {
        default:
                goto yyabort
        case c == ' ':
                goto yystate70
        case c == '\n':
                goto yystate69
        }

yystate69:
        c = l.next()
        goto yyrule29

yystate70:
        c = l.next()
        switch {
        default:
                goto yyabort
        case c >= '\x01' && c <= '\t' || c >= '\v' && c <= '\x1f' || c >= '!' && c <= 'ÿ':
                goto yystate71
        }

yystate71:
        c = l.next()
        switch {
        default:
                goto yyrule28
        case c >= '\x01' && c <= '\t' || c >= '\v' && c <= '\x1f' || c >= '!' && c <= 'ÿ':
                goto yystate71
        }

        // Rule actions follow. Apart from yyrule1, each returns the recognized
        // token, and most first set l.state to the start condition that the
        // following call to Lex will dispatch on.
yyrule1: // #{S}
        {
                // No token is returned here: the start condition becomes sComment
                // and the dispatch at yystate0 is executed again.
                l.state = sComment
                goto yystate0
        }
yyrule2: // HELP{S}
        {
                l.state = sMeta1
                return tHelp
                goto yystate0
        }
yyrule3: // TYPE{S}
        {
                l.state = sMeta1
                return tType
                goto yystate0
        }
yyrule4: // UNIT{S}
        {
                l.state = sMeta1
                return tUnit
                goto yystate0
        }
yyrule5: // "EOF"\n?
        {
                l.state = sInit
                return tEOFWord
                goto yystate0
        }
yyrule6: // \"(\\.|[^\\"])*\"
        {
                l.state = sMeta2
                return tMName
                goto yystate0
        }
yyrule7: // {M}({M}|{D})*
        {
                l.state = sMeta2
                return tMName
                goto yystate0
        }
yyrule8: // {S}{C}*\n
        {
                l.state = sInit
                return tText
                goto yystate0
        }
yyrule9: // {M}({M}|{D})*
        {
                l.state = sValue
                return tMName
                goto yystate0
        }
yyrule10: // \{
        {
                l.state = sLabels
                return tBraceOpen
                goto yystate0
        }
yyrule11: // \{
        {
                l.state = sLabels
                return tBraceOpen
                goto yystate0
        }
yyrule12: // {L}({L}|{D})*
        {
                return tLName
        }
yyrule13: // \"(\\.|[^\\"])*\"
        {
                l.state = sLabels
                return tQString
                goto yystate0
        }
yyrule14: // \}
        {
                l.state = sValue
                return tBraceClose
                goto yystate0
        }
yyrule15: // =
        {
                l.state = sLValue
                return tEqual
                goto yystate0
        }
yyrule16: // ,
        {
                return tComma
        }
yyrule17: // \"(\\.|[^\\"\n])*\"
        {
                l.state = sLabels
                return tLValue
                goto yystate0
        }
yyrule18: // {S}[^ \n]+
        {
                l.state = sTimestamp
                return tValue
                goto yystate0
        }
yyrule19: // {S}[^ \n]+
        {
                return tTimestamp
        }
yyrule20: // \n
        {
                l.state = sInit
                return tLinebreak
                goto yystate0
        }
yyrule21: // {S}#{S}\{
        {
                l.state = sExemplar
                return tComment
                goto yystate0
        }
yyrule22: // {L}({L}|{D})*
        {
                return tLName
        }
yyrule23: // \}
        {
                l.state = sEValue
                return tBraceClose
                goto yystate0
        }
yyrule24: // =
        {
                l.state = sEValue
                return tEqual
                goto yystate0
        }
yyrule25: // \"(\\.|[^\\"\n])*\"
        {
                l.state = sExemplar
                return tLValue
                goto yystate0
        }
yyrule26: // ,
        {
                return tComma
        }
yyrule27: // {S}[^ \n]+
        {
                l.state = sETimestamp
                return tValue
                goto yystate0
        }
yyrule28: // {S}[^ \n]+
        {
                return tTimestamp
        }
yyrule29: // \n
        if true { // avoid go vet determining the below panic will not be reached
                l.state = sInit
                return tLinebreak
                goto yystate0
        }
        panic("unreachable")

yyabort: // no lexem recognized
        // silence unused label errors for build and satisfy go vet reachability analysis
        {
                if false {
                        goto yyabort
                }
                if false {
                        goto yystate0
                }
                if false {
                        goto yystate1
                }
                if false {
                        goto yystate6
                }
                if false {
                        goto yystate26
                }
                if false {
                        goto yystate31
                }
                if false {
                        goto yystate34
                }
                if false {
                        goto yystate42
                }
                if false {
                        goto yystate46
                }
                if false {
                        goto yystate50
                }
                if false {
                        goto yystate57
                }
                if false {
                        goto yystate62
                }
                if false {
                        goto yystate68
                }
        }

        // No rule matched the input at the current position.
        return tInvalid
}

// Copyright 2018 The Prometheus Authors
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

//go:generate go get -u modernc.org/golex
//go:generate golex -o=openmetricslex.l.go openmetricslex.l

package textparse

import (
        "errors"
        "fmt"
        "io"
        "math"
        "strings"
        "unicode/utf8"

        "github.com/prometheus/common/model"

        "github.com/prometheus/prometheus/model/exemplar"
        "github.com/prometheus/prometheus/model/histogram"
        "github.com/prometheus/prometheus/model/labels"
        "github.com/prometheus/prometheus/model/value"
)

// openMetricsLexer carries the scanning state used while tokenizing an
// OpenMetrics text payload.
type openMetricsLexer struct {
        b     []byte // input buffer being tokenized
        i     int    // index of the current byte in b
        start int    // index in b where the current token begins (see buf)
        err   error  // io.EOF once the input is exhausted, or the error set via Error
        state int    // current start condition; compared against the s* constants
}

// buf yields the bytes of the token currently being scanned, i.e. the
// half-open window [start, i) of the input buffer.
func (l *openMetricsLexer) buf() []byte {
        from, to := l.start, l.i
        return l.b[from:to]
}

// next moves the lexer forward by one byte and returns the byte it lands on.
// When the end of the input is reached, l.err is set to io.EOF and
// byte(tEOF) is returned.
//
// Lex struggles with null bytes. Inside a label value, a help string or a
// comment, where they are allowed, null bytes are skipped here so the
// generated state machine never sees them.
func (l *openMetricsLexer) next() byte {
        l.i++
        for l.i < len(l.b) {
                c := l.b[l.i]
                nullAllowed := l.state == sLValue || l.state == sMeta2 || l.state == sComment
                if c != 0 || !nullAllowed {
                        return c
                }
                // Swallow the permitted null byte and look at the following one.
                l.i++
        }
        l.err = io.EOF
        return byte(tEOF)
}

// Error records the given message as the lexer's current error, replacing
// any error stored previously.
func (l *openMetricsLexer) Error(es string) {
        lexErr := errors.New(es)
        l.err = lexErr
}

// OpenMetricsParser parses samples from a byte slice of samples in the official
// OpenMetrics text exposition format.
// This is based on the working draft https://docs.google.com/document/u/1/d/1KwV0mAXwwbvvifBvDKH_LU1YjyXE_wxCkHNoCGq1GX0/edit
type OpenMetricsParser struct {
        l       *openMetricsLexer     // lexer over the input buffer
        builder labels.ScratchBuilder // reused by Metric and Exemplar to assemble label sets
        series  []byte                // raw bytes of the current series, starting at start
        text    []byte                // text of the current HELP/TYPE/UNIT entry (also returned by Comment)
        mtype   model.MetricType      // metric type decoded from the current TYPE entry
        val     float64               // value of the current sample
        ts      int64                 // sample timestamp in milliseconds; only meaningful if hasTS
        hasTS   bool                  // whether the current sample carries a timestamp
        start   int                   // lexer position at which the current entry begins
        // offsets is a list of offsets that describe the positions of the
        // metric name and label names and values for this series. The values
        // are positions in the lexer buffer; Metric subtracts start to index
        // into series.
        // p.offsets[0] is the start character of the metric name.
        // p.offsets[1] is the end of the metric name.
        // Subsequently, p.offsets holds, for each label, a pair of offset pairs:
        // the start and end of the label name followed by the start and end of
        // the label value.
        offsets []int

        eOffsets      []int   // like offsets, but for the exemplar's labels (no metric-name slots)
        exemplar      []byte  // raw bytes from start through the exemplar's label set; empty if none
        exemplarVal   float64 // value of the current exemplar
        exemplarTs    int64   // exemplar timestamp in milliseconds; only meaningful if hasExemplarTs
        hasExemplarTs bool    // whether the current exemplar carries a timestamp
}

// NewOpenMetricsParser creates a parser that reads OpenMetrics entries from b.
// Labels produced by the parser are built against the symbol table st.
func NewOpenMetricsParser(b []byte, st *labels.SymbolTable) Parser {
        p := new(OpenMetricsParser)
        p.l = &openMetricsLexer{b: b}
        p.builder = labels.NewScratchBuilderWithSymbolTable(st, 16)
        return p
}

// Series reports the current sample: the raw series bytes, a pointer to a
// copy of the timestamp (nil when the sample has none), and the value.
func (p *OpenMetricsParser) Series() ([]byte, *int64, float64) {
        var tsPtr *int64
        if p.hasTS {
                // Hand out a copy so callers cannot alias parser state.
                tsCopy := p.ts
                tsPtr = &tsCopy
        }
        return p.series, tsPtr, p.val
}

// Histogram always yields four nil values for now, because OpenMetrics does
// not support sparse histograms yet.
func (p *OpenMetricsParser) Histogram() ([]byte, *int64, *histogram.Histogram, *histogram.FloatHistogram) {
        var (
                series []byte
                ts     *int64
                h      *histogram.Histogram
                fh     *histogram.FloatHistogram
        )
        return series, ts, h, fh
}

// Help yields the metric name and the help text of the current entry.
// It must only be called after Next returned a help entry, and the
// returned byte slices become invalid after the next call to Next.
func (p *OpenMetricsParser) Help() ([]byte, []byte) {
        name := p.l.b[p.offsets[0]:p.offsets[1]]
        help := p.text

        // The replacer allocates, so text without any backslash is returned
        // untouched.
        if strings.IndexByte(yoloString(help), byte('\\')) < 0 {
                return name, help
        }
        // OpenMetrics always uses the Prometheus format label value escaping.
        return name, []byte(lvalReplacer.Replace(string(help)))
}

// Type yields the metric name and the metric type of the current entry.
// It must only be called after Next returned a type entry, and the
// returned byte slice becomes invalid after the next call to Next.
func (p *OpenMetricsParser) Type() ([]byte, model.MetricType) {
        name := p.l.b[p.offsets[0]:p.offsets[1]]
        return name, p.mtype
}

// Unit yields the metric name and the unit of the current entry.
// It must only be called after Next returned a unit entry, and the
// returned byte slices become invalid after the next call to Next.
func (p *OpenMetricsParser) Unit() ([]byte, []byte) {
        name := p.l.b[p.offsets[0]:p.offsets[1]]
        return name, p.text
}

// Comment yields the text stored for the current entry.
// It must only be called after Next returned a comment entry, and the
// returned byte slice becomes invalid after the next call to Next.
func (p *OpenMetricsParser) Comment() []byte {
        text := p.text
        return text
}

// Metric stores the label set of the current sample (including the metric
// name label) in *l, sorted, and returns the series text it was parsed from.
func (p *OpenMetricsParser) Metric(l *labels.Labels) string {
        // One string copy of the series serves both as the return value and as
        // the backing text for every name and value sliced out below.
        s := string(p.series)
        base := p.start

        p.builder.Reset()
        p.builder.Add(labels.MetricName, unreplace(s[p.offsets[0]-base:p.offsets[1]-base]))

        // After the two metric-name slots, offsets come in groups of four:
        // name start, name end, value start, value end.
        for i := 2; i < len(p.offsets); i += 4 {
                nameLo, nameHi := p.offsets[i]-base, p.offsets[i+1]-base
                name := unreplace(s[nameLo:nameHi])

                valLo, valHi := p.offsets[i+2]-base, p.offsets[i+3]-base
                val := unreplace(s[valLo:valHi])

                p.builder.Add(name, val)
        }

        p.builder.Sort()
        *l = p.builder.Labels()

        return s
}

// Exemplar copies the exemplar of the current sample into e and reports
// whether there was one. OpenMetrics allows at most one exemplar per sample,
// so after a successful call every further call for the same sample
// returns false.
func (p *OpenMetricsParser) Exemplar(e *exemplar.Exemplar) bool {
        if len(p.exemplar) == 0 {
                return false
        }

        // Materialize one immutable string up front; the label names and
        // values below are just substrings of it.
        raw := string(p.exemplar)
        base := p.start

        e.Value = p.exemplarVal
        if p.hasExemplarTs {
                e.HasTs = true
                e.Ts = p.exemplarTs
        }

        p.builder.Reset()
        // eOffsets holds groups of four: name start, name end, value start,
        // value end.
        for i := 0; i < len(p.eOffsets); i += 4 {
                nameLo, nameHi, valLo, valHi := p.eOffsets[i]-base, p.eOffsets[i+1]-base, p.eOffsets[i+2]-base, p.eOffsets[i+3]-base
                p.builder.Add(raw[nameLo:nameHi], raw[valLo:valHi])
        }
        p.builder.Sort()
        e.Labels = p.builder.Labels()

        // Truncate the stored exemplar so that later calls return false.
        p.exemplar = p.exemplar[:0]
        return true
}

// CreatedTimestamp always yields nil because created timestamps are not
// implemented for this parser yet.
// TODO(bwplotka): https://github.com/prometheus/prometheus/issues/12980
func (p *OpenMetricsParser) CreatedTimestamp() *int64 {
        var ct *int64
        return ct
}

// nextToken asks the openMetricsLexer for the following token and hands it
// back unchanged.
func (p *OpenMetricsParser) nextToken() token {
        return p.l.Lex()
}

// parseError builds an error describing a mismatch between what the parser
// expected (exp) and the token it got. The message quotes the offending
// token text and the entry parsed so far, both extended by one byte past the
// lexer position where the buffer allows it.
func (p *OpenMetricsParser) parseError(exp string, got token) error {
        end := len(p.l.b)
        if p.l.i+1 <= end {
                end = p.l.i + 1
        }
        tokenText := p.l.b[p.l.start:end]
        entryText := p.l.b[p.start:end]
        return fmt.Errorf("%s, got %q (%q) while parsing: %q", exp, tokenText, got, entryText)
}

// Next moves the parser on to the following entry and reports its kind.
// It returns (EntryInvalid, io.EOF) once the "# EOF" marker has been read
// and nothing follows it. Any other problem is reported as EntryInvalid
// together with a descriptive error.
func (p *OpenMetricsParser) Next() (Entry, error) {
        var err error

        // Forget everything that belonged to the previous entry.
        p.start = p.l.i
        p.offsets = p.offsets[:0]
        p.eOffsets = p.eOffsets[:0]
        p.exemplar = p.exemplar[:0]
        p.exemplarVal = 0
        p.hasExemplarTs = false

        switch t := p.nextToken(); t {
        case tEOFWord:
                if trailing := p.nextToken(); trailing != tEOF {
                        return EntryInvalid, errors.New("unexpected data after # EOF")
                }
                return EntryInvalid, io.EOF

        case tEOF:
                return EntryInvalid, errors.New("data does not end with # EOF")

        case tHelp, tType, tUnit:
                // Metadata lines are "<keyword> <metric name> <text>".
                nameTok := p.nextToken()
                if nameTok != tMName {
                        return EntryInvalid, p.parseError("expected metric name after "+t.String(), nameTok)
                }
                nameStart, nameEnd := p.l.start, p.l.i
                if p.l.b[nameStart] == '"' && p.l.b[nameEnd-1] == '"' {
                        // Drop the surrounding quotes of a quoted metric name.
                        nameStart++
                        nameEnd--
                }
                p.offsets = append(p.offsets, nameStart, nameEnd)

                if textTok := p.nextToken(); textTok != tText {
                        return EntryInvalid, fmt.Errorf("expected text in %s", t.String())
                }
                // The text token carries one extra byte on each side; strip both.
                p.text = []byte{}
                if raw := p.l.buf(); len(raw) > 1 {
                        p.text = raw[1 : len(raw)-1]
                }

                switch t {
                case tHelp:
                        if !utf8.Valid(p.text) {
                                return EntryInvalid, fmt.Errorf("help text %q is not a valid utf8 string", p.text)
                        }
                        return EntryHelp, nil

                case tType:
                        var mtype model.MetricType
                        switch typeName := yoloString(p.text); typeName {
                        case "counter":
                                mtype = model.MetricTypeCounter
                        case "gauge":
                                mtype = model.MetricTypeGauge
                        case "histogram":
                                mtype = model.MetricTypeHistogram
                        case "gaugehistogram":
                                mtype = model.MetricTypeGaugeHistogram
                        case "summary":
                                mtype = model.MetricTypeSummary
                        case "info":
                                mtype = model.MetricTypeInfo
                        case "stateset":
                                mtype = model.MetricTypeStateset
                        case "unknown":
                                mtype = model.MetricTypeUnknown
                        default:
                                return EntryInvalid, fmt.Errorf("invalid metric type %q", typeName)
                        }
                        p.mtype = mtype
                        return EntryType, nil

                case tUnit:
                        m := yoloString(p.l.b[p.offsets[0]:p.offsets[1]])
                        u := yoloString(p.text)
                        // A non-empty unit must be the metric name's suffix and be
                        // preceded by an underscore.
                        if len(u) > 0 && (!strings.HasSuffix(m, u) || len(m) < len(u)+1 || m[len(m)-len(u)-1] != '_') {
                                return EntryInvalid, fmt.Errorf("unit %q not a suffix of metric %q", u, m)
                        }
                        return EntryUnit, nil
                }

        case tBraceOpen:
                // The line starts with a brace, so the metric name has to appear
                // inside it. Reserve the two name slots with -1 markers; if they
                // are still -1 after the labels are parsed, no name was given and
                // parseMetricSuffix reports the error.
                if len(p.offsets) == 0 {
                        p.offsets = []int{-1, -1}
                }
                p.offsets, err = p.parseLVals(p.offsets, false)
                if err != nil {
                        return EntryInvalid, err
                }

                p.series = p.l.b[p.start:p.l.i]
                return p.parseMetricSuffix(p.nextToken())

        case tMName:
                p.offsets = append(p.offsets, p.start, p.l.i)
                p.series = p.l.b[p.start:p.l.i]

                following := p.nextToken()
                if following != tBraceOpen {
                        return p.parseMetricSuffix(following)
                }
                p.offsets, err = p.parseLVals(p.offsets, false)
                if err != nil {
                        return EntryInvalid, err
                }
                // The series text now also covers the label set.
                p.series = p.l.b[p.start:p.l.i]
                return p.parseMetricSuffix(p.nextToken())

        default:
                err = p.parseError("expected a valid start token", t)
        }
        return EntryInvalid, err
}

// parseComment reads the exemplar that follows a sample: its label set, its
// value and an optional timestamp, up to and including the line break. The
// results are stored in the parser's exemplar fields.
func (p *OpenMetricsParser) parseComment() error {
        var err error

        // Label set first.
        if p.eOffsets, err = p.parseLVals(p.eOffsets, true); err != nil {
                return err
        }
        p.exemplar = p.l.b[p.start:p.l.i]

        // Then the mandatory value.
        if p.exemplarVal, err = p.getFloatValue(p.nextToken(), "exemplar labels"); err != nil {
                return err
        }

        // Finally either the end of the line or a timestamp.
        p.hasExemplarTs = false
        tsTok := p.nextToken()
        if tsTok == tLinebreak {
                return nil
        }
        if tsTok == tEOF {
                return errors.New("data does not end with # EOF")
        }
        if tsTok != tTimestamp {
                return p.parseError("expected timestamp or comment", tsTok)
        }

        p.hasExemplarTs = true
        // A float is enough to hold what we need for millisecond resolution.
        ts, err := parseFloat(yoloString(p.l.buf()[1:]))
        if err != nil {
                return fmt.Errorf("%w while parsing: %q", err, p.l.b[p.start:p.l.i])
        }
        if math.IsNaN(ts) || math.IsInf(ts, 0) {
                return fmt.Errorf("invalid exemplar timestamp %f", ts)
        }
        p.exemplarTs = int64(ts * 1000)

        if endTok := p.nextToken(); endTok != tLinebreak {
                return p.parseError("expected next entry after exemplar timestamp", endTok)
        }
        return nil
}

// parseLVals parses the contents of a brace-delimited label set up to and
// including the closing brace. For every label it appends four positions to
// offsets (name start, name end, value start, value end) and returns the
// extended slice.
//
// When isExemplar is false, a token followed directly by a comma or closing
// brace is taken as the metric name and written to offsets[0] and offsets[1],
// which must both still be -1. When isExemplar is true that form is rejected.
//
// On any error the returned slice is nil.
func (p *OpenMetricsParser) parseLVals(offsets []int, isExemplar bool) ([]int, error) {
        t := p.nextToken()
        for {
                // Remember where the current token sits before the lexer moves on.
                curTStart := p.l.start
                curTI := p.l.i
                switch t {
                case tBraceClose:
                        return offsets, nil
                case tLName:
                case tQString:
                default:
                        return nil, p.parseError("expected label name", t)
                }

                t = p.nextToken()
                // A quoted string followed by a comma or brace is a metric name. Set the
                // offsets and continue processing. If this is an exemplar, this format
                // is not allowed.
                // NOTE(review): this branch does not check that the token was a
                // tQString, yet it always strips one byte from each end — confirm
                // that an unquoted tLName cannot legitimately reach it.
                if t == tComma || t == tBraceClose {
                        if isExemplar {
                                return nil, p.parseError("expected label name", t)
                        }
                        if offsets[0] != -1 || offsets[1] != -1 {
                                return nil, fmt.Errorf("metric name already set while parsing: %q", p.l.b[p.start:p.l.i])
                        }
                        offsets[0] = curTStart + 1
                        offsets[1] = curTI - 1
                        if t == tBraceClose {
                                return offsets, nil
                        }
                        t = p.nextToken()
                        continue
                }
                // We have a label name, and it might be quoted.
                if p.l.b[curTStart] == '"' {
                        curTStart++
                        curTI--
                }
                offsets = append(offsets, curTStart, curTI)

                if t != tEqual {
                        return nil, p.parseError("expected equal", t)
                }
                if t := p.nextToken(); t != tLValue {
                        return nil, p.parseError("expected label value", t)
                }
                if !utf8.Valid(p.l.buf()) {
                        return nil, fmt.Errorf("invalid UTF-8 label value: %q", p.l.buf())
                }

                // The openMetricsLexer ensures the value string is quoted. Strip first
                // and last character.
                offsets = append(offsets, p.l.start+1, p.l.i-1)

                // Free trailing commas are allowed.
                t = p.nextToken()
                if t == tComma {
                        t = p.nextToken()
                } else if t != tBraceClose {
                        return nil, p.parseError("expected comma or brace close", t)
                }
        }
}

// parseMetricSuffix handles what follows the metric name and labels on a
// sample line: the value, then optionally a timestamp and/or an exemplar.
// t is the token the caller has already read after the name/labels.
func (p *OpenMetricsParser) parseMetricSuffix(t token) (Entry, error) {
        // A -1 marker means a brace-only series never supplied a metric name.
        if p.offsets[0] == -1 {
                return EntryInvalid, fmt.Errorf("metric name not set while parsing: %q", p.l.b[p.start:p.l.i])
        }

        var err error
        if p.val, err = p.getFloatValue(t, "metric"); err != nil {
                return EntryInvalid, err
        }

        p.hasTS = false
        switch tok := p.nextToken(); tok {
        case tEOF:
                return EntryInvalid, errors.New("data does not end with # EOF")

        case tComment:
                if err = p.parseComment(); err != nil {
                        return EntryInvalid, err
                }

        case tTimestamp:
                p.hasTS = true
                // A float is enough to hold what we need for millisecond resolution.
                ts, err := parseFloat(yoloString(p.l.buf()[1:]))
                if err != nil {
                        return EntryInvalid, fmt.Errorf("%w while parsing: %q", err, p.l.b[p.start:p.l.i])
                }
                if math.IsNaN(ts) || math.IsInf(ts, 0) {
                        return EntryInvalid, fmt.Errorf("invalid timestamp %f", ts)
                }
                p.ts = int64(ts * 1000)

                // After the timestamp only a line break or an exemplar may follow.
                after := p.nextToken()
                if after == tComment {
                        if err := p.parseComment(); err != nil {
                                return EntryInvalid, err
                        }
                } else if after != tLinebreak {
                        return EntryInvalid, p.parseError("expected next entry after timestamp", after)
                }
        }
        // A line break, like any token not handled above, ends the sample here.
        return EntrySeries, nil
}

// getFloatValue converts the lexer's current token into a float64.
//
// t is the token that was just read and must be tValue; otherwise a parse
// error mentioning after (a description of what preceded the value) is
// returned. The first byte of the token is skipped before parsing. Any NaN
// result is replaced by the canonical value.NormalNaN bit pattern.
//
// On error the returned value is 0.
func (p *OpenMetricsParser) getFloatValue(t token, after string) (float64, error) {
        if t != tValue {
                return 0, p.parseError(fmt.Sprintf("expected value after %v", after), t)
        }
        val, err := parseFloat(yoloString(p.l.buf()[1:]))
        if err != nil {
                return 0, fmt.Errorf("%w while parsing: %q", err, p.l.b[p.start:p.l.i])
        }
        // Ensure canonical NaN value. The check must look at the value parsed
        // just now: p.exemplarVal is reset to 0 by Next and is only assigned
        // after this function returns, so testing it never normalized anything.
        if math.IsNaN(val) {
                val = math.Float64frombits(value.NormalNaN)
        }
        return val, nil
}

// Code generated by golex. DO NOT EDIT.

// Copyright 2017 The Prometheus Authors
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package textparse

import (
        "fmt"
)

// Lexer start conditions. The lexer's state field holds one of these values
// and selects which group of rules Lex starts matching from.
const (
        sInit       = iota // initial condition; restored after each line break
        sComment           // entered after "#" followed by whitespace
        sMeta1             // entered after the HELP or TYPE keyword
        sMeta2             // entered after the metric name of a metadata line
        sLabels            // entered after an opening brace
        sLValue            // entered after "=" inside a label set
        sValue             // entered after a metric name or a closing brace
        sTimestamp         // entered after a sample value
        sExemplar          // not handled by promlexer.Lex; set by the OpenMetrics lexer
        sEValue            // not handled by promlexer.Lex; set by the OpenMetrics lexer
        sETimestamp        // not handled by promlexer.Lex; set by the OpenMetrics lexer
)

// Lex is called by the parser generated by "go tool yacc" to obtain each
// token. The method is opened before the matching rules block and closed at
// the end of the file.
//
// It returns tEOF right away when l.i is at or beyond the end of the buffer.
// Otherwise it records the token start in l.start and runs the golex-generated
// state machine selected by l.state. Matching rules may change l.state before
// returning their token. When no rule matches, tInvalid is returned, except in
// the sComment state where the rest of the line is consumed as a comment.
//
// The body below is generated code; regenerate it from the lexer definition
// rather than editing it by hand.
func (l *promlexer) Lex() token {
        if l.i >= len(l.b) {
                return tEOF
        }
        c := l.b[l.i]
        l.start = l.i

yystate0:

        // Only start conditions 0 through 7 exist for this lexer; any other
        // value of l.state panics.
        switch yyt := l.state; yyt {
        default:
                panic(fmt.Errorf(`invalid start condition %d`, yyt))
        case 0: // start condition: INITIAL
                goto yystart1
        case 1: // start condition: sComment
                goto yystart9
        case 2: // start condition: sMeta1
                goto yystart20
        case 3: // start condition: sMeta2
                goto yystart25
        case 4: // start condition: sLabels
                goto yystart28
        case 5: // start condition: sLValue
                goto yystart36
        case 6: // start condition: sValue
                goto yystart40
        case 7: // start condition: sTimestamp
                goto yystart43
        }

yystate1:
        c = l.next()
yystart1:
        switch {
        default:
                goto yyabort
        case c == '#':
                goto yystate5
        case c == ':' || c >= 'A' && c <= 'Z' || c == '_' || c >= 'a' && c <= 'z':
                goto yystate7
        case c == '\n':
                goto yystate4
        case c == '\t' || c == ' ':
                goto yystate3
        case c == '\x00':
                goto yystate2
        case c == '{':
                goto yystate8
        }

yystate2:
        c = l.next()
        goto yyrule1

yystate3:
        c = l.next()
        switch {
        default:
                goto yyrule3
        case c == '\t' || c == ' ':
                goto yystate3
        }

yystate4:
        c = l.next()
        goto yyrule2

yystate5:
        c = l.next()
        switch {
        default:
                goto yyrule5
        case c == '\t' || c == ' ':
                goto yystate6
        }

yystate6:
        c = l.next()
        switch {
        default:
                goto yyrule4
        case c == '\t' || c == ' ':
                goto yystate6
        }

yystate7:
        c = l.next()
        switch {
        default:
                goto yyrule11
        case c >= '0' && c <= ':' || c >= 'A' && c <= 'Z' || c == '_' || c >= 'a' && c <= 'z':
                goto yystate7
        }

yystate8:
        c = l.next()
        goto yyrule13

yystate9:
        c = l.next()
yystart9:
        switch {
        default:
                goto yyabort
        case c == 'H':
                goto yystate10
        case c == 'T':
                goto yystate15
        case c == '\t' || c == ' ':
                goto yystate3
        }

yystate10:
        c = l.next()
        switch {
        default:
                goto yyabort
        case c == 'E':
                goto yystate11
        }

yystate11:
        c = l.next()
        switch {
        default:
                goto yyabort
        case c == 'L':
                goto yystate12
        }

yystate12:
        c = l.next()
        switch {
        default:
                goto yyabort
        case c == 'P':
                goto yystate13
        }

yystate13:
        c = l.next()
        switch {
        default:
                goto yyabort
        case c == '\t' || c == ' ':
                goto yystate14
        }

yystate14:
        c = l.next()
        switch {
        default:
                goto yyrule6
        case c == '\t' || c == ' ':
                goto yystate14
        }

yystate15:
        c = l.next()
        switch {
        default:
                goto yyabort
        case c == 'Y':
                goto yystate16
        }

yystate16:
        c = l.next()
        switch {
        default:
                goto yyabort
        case c == 'P':
                goto yystate17
        }

yystate17:
        c = l.next()
        switch {
        default:
                goto yyabort
        case c == 'E':
                goto yystate18
        }

yystate18:
        c = l.next()
        switch {
        default:
                goto yyabort
        case c == '\t' || c == ' ':
                goto yystate19
        }

yystate19:
        c = l.next()
        switch {
        default:
                goto yyrule7
        case c == '\t' || c == ' ':
                goto yystate19
        }

yystate20:
        c = l.next()
yystart20:
        switch {
        default:
                goto yyabort
        case c == '"':
                goto yystate21
        case c == ':' || c >= 'A' && c <= 'Z' || c == '_' || c >= 'a' && c <= 'z':
                goto yystate24
        case c == '\t' || c == ' ':
                goto yystate3
        }

yystate21:
        c = l.next()
        switch {
        default:
                goto yyabort
        case c == '"':
                goto yystate22
        case c == '\\':
                goto yystate23
        case c >= '\x01' && c <= '!' || c >= '#' && c <= '[' || c >= ']' && c <= 'ÿ':
                goto yystate21
        }

yystate22:
        c = l.next()
        goto yyrule8

yystate23:
        c = l.next()
        switch {
        default:
                goto yyabort
        case c >= '\x01' && c <= '\t' || c >= '\v' && c <= 'ÿ':
                goto yystate21
        }

yystate24:
        c = l.next()
        switch {
        default:
                goto yyrule9
        case c >= '0' && c <= ':' || c >= 'A' && c <= 'Z' || c == '_' || c >= 'a' && c <= 'z':
                goto yystate24
        }

yystate25:
        c = l.next()
yystart25:
        switch {
        default:
                goto yyrule10
        case c == '\t' || c == ' ':
                goto yystate27
        case c >= '\x01' && c <= '\b' || c >= '\v' && c <= '\x1f' || c >= '!' && c <= 'ÿ':
                goto yystate26
        }

yystate26:
        c = l.next()
        switch {
        default:
                goto yyrule10
        case c >= '\x01' && c <= '\t' || c >= '\v' && c <= 'ÿ':
                goto yystate26
        }

yystate27:
        c = l.next()
        switch {
        default:
                goto yyrule3
        case c == '\t' || c == ' ':
                goto yystate27
        case c >= '\x01' && c <= '\b' || c >= '\v' && c <= '\x1f' || c >= '!' && c <= 'ÿ':
                goto yystate26
        }

yystate28:
        c = l.next()
yystart28:
        switch {
        default:
                goto yyabort
        case c == '"':
                goto yystate29
        case c == ',':
                goto yystate32
        case c == '=':
                goto yystate33
        case c == '\t' || c == ' ':
                goto yystate3
        case c == '}':
                goto yystate35
        case c >= 'A' && c <= 'Z' || c == '_' || c >= 'a' && c <= 'z':
                goto yystate34
        }

yystate29:
        c = l.next()
        switch {
        default:
                goto yyabort
        case c == '"':
                goto yystate30
        case c == '\\':
                goto yystate31
        case c >= '\x01' && c <= '!' || c >= '#' && c <= '[' || c >= ']' && c <= 'ÿ':
                goto yystate29
        }

yystate30:
        c = l.next()
        goto yyrule15

yystate31:
        c = l.next()
        switch {
        default:
                goto yyabort
        case c >= '\x01' && c <= '\t' || c >= '\v' && c <= 'ÿ':
                goto yystate29
        }

yystate32:
        c = l.next()
        goto yyrule18

yystate33:
        c = l.next()
        goto yyrule17

yystate34:
        c = l.next()
        switch {
        default:
                goto yyrule14
        case c >= '0' && c <= '9' || c >= 'A' && c <= 'Z' || c == '_' || c >= 'a' && c <= 'z':
                goto yystate34
        }

yystate35:
        c = l.next()
        goto yyrule16

yystate36:
        c = l.next()
yystart36:
        switch {
        default:
                goto yyabort
        case c == '"':
                goto yystate37
        case c == '\t' || c == ' ':
                goto yystate3
        }

yystate37:
        c = l.next()
        switch {
        default:
                goto yyabort
        case c == '"':
                goto yystate38
        case c == '\\':
                goto yystate39
        case c >= '\x01' && c <= '!' || c >= '#' && c <= '[' || c >= ']' && c <= 'ÿ':
                goto yystate37
        }

yystate38:
        c = l.next()
        goto yyrule19

yystate39:
        c = l.next()
        switch {
        default:
                goto yyabort
        case c >= '\x01' && c <= '\t' || c >= '\v' && c <= 'ÿ':
                goto yystate37
        }

yystate40:
        c = l.next()
yystart40:
        switch {
        default:
                goto yyabort
        case c == '\t' || c == ' ':
                goto yystate3
        case c == '{':
                goto yystate42
        case c >= '\x01' && c <= '\b' || c >= '\v' && c <= '\x1f' || c >= '!' && c <= 'z' || c >= '|' && c <= 'ÿ':
                goto yystate41
        }

yystate41:
        c = l.next()
        switch {
        default:
                goto yyrule20
        case c >= '\x01' && c <= '\b' || c >= '\v' && c <= '\x1f' || c >= '!' && c <= 'z' || c >= '|' && c <= 'ÿ':
                goto yystate41
        }

yystate42:
        c = l.next()
        goto yyrule12

yystate43:
        c = l.next()
yystart43:
        switch {
        default:
                goto yyabort
        case c == '\n':
                goto yystate44
        case c == '\t' || c == ' ':
                goto yystate3
        case c >= '0' && c <= '9':
                goto yystate45
        }

yystate44:
        c = l.next()
        goto yyrule22

yystate45:
        c = l.next()
        switch {
        default:
                goto yyrule21
        case c >= '0' && c <= '9':
                goto yystate45
        }

yyrule1: // \0
        {
                return tEOF
        }
yyrule2: // \n
        {
                l.state = sInit
                return tLinebreak
                goto yystate0
        }
yyrule3: // [ \t]+
        {
                return tWhitespace
        }
yyrule4: // #[ \t]+
        {
                // No token is returned here: switch to the comment condition
                // and keep scanning from the current byte.
                l.state = sComment
                goto yystate0
        }
yyrule5: // #
        {
                return l.consumeComment()
        }
yyrule6: // HELP[\t ]+
        {
                l.state = sMeta1
                return tHelp
                goto yystate0
        }
yyrule7: // TYPE[\t ]+
        {
                l.state = sMeta1
                return tType
                goto yystate0
        }
yyrule8: // \"(\\.|[^\\"])*\"
        {
                l.state = sMeta2
                return tMName
                goto yystate0
        }
yyrule9: // {M}({M}|{D})*
        {
                l.state = sMeta2
                return tMName
                goto yystate0
        }
yyrule10: // {C}*
        {
                l.state = sInit
                return tText
                goto yystate0
        }
yyrule11: // {M}({M}|{D})*
        {
                l.state = sValue
                return tMName
                goto yystate0
        }
yyrule12: // \{
        {
                l.state = sLabels
                return tBraceOpen
                goto yystate0
        }
yyrule13: // \{
        {
                l.state = sLabels
                return tBraceOpen
                goto yystate0
        }
yyrule14: // {L}({L}|{D})*
        {
                return tLName
        }
yyrule15: // \"(\\.|[^\\"])*\"
        {
                l.state = sLabels
                return tQString
                goto yystate0
        }
yyrule16: // \}
        {
                l.state = sValue
                return tBraceClose
                goto yystate0
        }
yyrule17: // =
        {
                l.state = sLValue
                return tEqual
                goto yystate0
        }
yyrule18: // ,
        {
                return tComma
        }
yyrule19: // \"(\\.|[^\\"])*\"
        {
                l.state = sLabels
                return tLValue
                goto yystate0
        }
yyrule20: // [^{ \t\n]+
        {
                l.state = sTimestamp
                return tValue
                goto yystate0
        }
yyrule21: // {D}+
        {
                return tTimestamp
        }
yyrule22: // \n
        if true { // avoid go vet determining the below panic will not be reached
                l.state = sInit
                return tLinebreak
                goto yystate0
        }
        panic("unreachable")

yyabort: // no lexem recognized
        // silence unused label errors for build and satisfy go vet reachability analysis
        {
                if false {
                        goto yyabort
                }
                if false {
                        goto yystate0
                }
                if false {
                        goto yystate1
                }
                if false {
                        goto yystate9
                }
                if false {
                        goto yystate20
                }
                if false {
                        goto yystate25
                }
                if false {
                        goto yystate28
                }
                if false {
                        goto yystate36
                }
                if false {
                        goto yystate40
                }
                if false {
                        goto yystate43
                }
        }

        // Workaround to gobble up comments that started with a HELP or TYPE
        // prefix. We just consume all characters until we reach a newline.
        // This saves us from adding disproportionate complexity to the parser.
        if l.state == sComment {
                return l.consumeComment()
        }
        return tInvalid
}

// consumeComment scans forward from the current byte to the end of a comment
// line. It returns tEOF if a null byte is reached first. Otherwise it stops on
// the terminating newline, puts the lexer back into sInit and returns
// tComment.
func (l *promlexer) consumeComment() token {
        ch := l.cur()
        for {
                if ch == 0 {
                        return tEOF
                }
                if ch == '\n' {
                        l.state = sInit
                        return tComment
                }
                ch = l.next()
        }
}

// Copyright 2017 The Prometheus Authors
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

//go:generate go get -u modernc.org/golex
//go:generate golex -o=promlex.l.go promlex.l

package textparse

import (
        "errors"
        "fmt"
        "io"
        "math"
        "strconv"
        "strings"
        "unicode/utf8"
        "unsafe"

        "github.com/prometheus/common/model"

        "github.com/prometheus/prometheus/model/exemplar"
        "github.com/prometheus/prometheus/model/histogram"
        "github.com/prometheus/prometheus/model/labels"
        "github.com/prometheus/prometheus/model/value"
)

// promlexer holds the scanning state used to tokenize input in the Prometheus
// text exposition format.
type promlexer struct {
        b     []byte // Input buffer being tokenized.
        i     int    // Current read position within b; advanced by next.
        start int    // Start of the current token within b; buf returns b[start:i].
        err   error  // Set to io.EOF by next at end of input, or by Error.
        state int    // Current lexer state (for example sInit, sComment or sLabels).
}

// token identifies the kind of lexical token returned by the lexer.
type token int

// Token kinds returned by the lexer. tInvalid and tEOF are pinned to -1 and 0.
// tLinebreak is the third entry of the block, so iota evaluates to 2 there and
// every following token counts upwards from it.
const (
        tInvalid   token = -1
        tEOF       token = 0
        tLinebreak token = iota
        tWhitespace
        tHelp
        tType
        tUnit
        tEOFWord
        tText
        tComment
        tBlank
        tMName
        tQString
        tBraceOpen
        tBraceClose
        tLName
        tLValue
        tComma
        tEqual
        tTimestamp
        tValue
)

// tokenNames maps every known token to the name reported by token.String.
var tokenNames = map[token]string{
        tInvalid:    "INVALID",
        tEOF:        "EOF",
        tLinebreak:  "LINEBREAK",
        tWhitespace: "WHITESPACE",
        tHelp:       "HELP",
        tType:       "TYPE",
        tUnit:       "UNIT",
        tEOFWord:    "EOFWORD",
        tText:       "TEXT",
        tComment:    "COMMENT",
        tBlank:      "BLANK",
        tMName:      "MNAME",
        tQString:    "QSTRING",
        tBraceOpen:  "BOPEN",
        tBraceClose: "BCLOSE",
        tLName:      "LNAME",
        tLValue:     "LVALUE",
        tEqual:      "EQUAL",
        tComma:      "COMMA",
        tTimestamp:  "TIMESTAMP",
        tValue:      "VALUE",
}

// String returns the upper-case display name of the token. Values that are
// not one of the declared tokens are rendered as "<invalid: N>".
func (t token) String() string {
        if name, known := tokenNames[t]; known {
                return name
        }
        return fmt.Sprintf("<invalid: %d>", t)
}

// buf returns the bytes of the token currently being scanned, i.e. the input
// between the token start and the current position. The result aliases the
// lexer's input buffer.
func (l *promlexer) buf() []byte {
        from, to := l.start, l.i
        return l.b[from:to]
}

// cur returns the byte at the lexer's current position without advancing.
func (l *promlexer) cur() byte {
        pos := l.i
        return l.b[pos]
}

// next moves the promlexer forward by one byte and returns the byte at the new
// position. When the end of the input is reached, it records io.EOF in l.err
// and returns byte(tEOF).
func (l *promlexer) next() byte {
        l.i++
        if l.i >= len(l.b) {
                l.err = io.EOF
                return byte(tEOF)
        }
        // Lex struggles with null bytes. Label values, help strings and comments
        // may legitimately contain them, so in those states they are skipped
        // right here instead of being handed to the generated lexer.
        skipNulls := l.state == sLValue || l.state == sMeta2 || l.state == sComment
        for skipNulls && l.b[l.i] == 0 {
                l.i++
        }
        return l.b[l.i]
}

// Error records es as the lexer's current error, replacing any previous one.
func (l *promlexer) Error(es string) {
        lexErr := errors.New(es)
        l.err = lexErr
}

// PromParser parses samples from a byte slice of samples in the official
// Prometheus text exposition format.
type PromParser struct {
        l       *promlexer            // Lexer over the input buffer.
        builder labels.ScratchBuilder // Reused by Metric to assemble the label set.
        series  []byte                // Raw bytes of the current series (metric name and labels).
        text    []byte                // Text of the current HELP, TYPE or comment entry.
        mtype   model.MetricType      // Metric type set while parsing a TYPE entry.
        val     float64               // Value of the current sample.
        ts      int64                 // Timestamp of the current sample; only meaningful if hasTS is true.
        hasTS   bool                  // Whether the current sample carried a timestamp.
        start   int                   // Position in the lexer buffer where the current entry started.
        // offsets is a list of positions in the lexer buffer (l.b) that describe
        // where the metric name and the label names and values of the current
        // series are. Metric subtracts start to index into the series string.
        // p.offsets[0] is the start character of the metric name.
        // p.offsets[1] is the end of the metric name.
        // Both are -1 while a series that begins with a brace has not yet
        // provided its quoted metric name.
        // Subsequently, p.offsets holds four entries per label: the start and
        // end of the label name followed by the start and end of the label value.
        offsets []int
}

// NewPromParser creates a parser for the Prometheus text format over b, using
// st as the symbol table for the labels it builds.
//
// A newline is appended to b so that the last line is always terminated. As
// with any append, this may write into spare capacity of the caller's slice.
func NewPromParser(b []byte, st *labels.SymbolTable) Parser {
        lexer := &promlexer{b: append(b, '\n')}
        parser := &PromParser{
                l:       lexer,
                builder: labels.NewScratchBuilderWithSymbolTable(st, 16),
        }
        return parser
}

// Series returns the raw bytes of the current series, a pointer to its
// timestamp (nil when the sample had none) and the sample value.
func (p *PromParser) Series() ([]byte, *int64, float64) {
        var ts *int64
        if p.hasTS {
                ts = &p.ts
        }
        return p.series, ts, p.val
}

// Histogram always returns four nil values: the Prometheus text format does
// not support sparse histograms yet.
func (p *PromParser) Histogram() ([]byte, *int64, *histogram.Histogram, *histogram.FloatHistogram) {
        var (
                series []byte
                ts     *int64
                h      *histogram.Histogram
                fh     *histogram.FloatHistogram
        )
        return series, ts, h, fh
}

// Help returns the metric name and the help text of the current entry, with
// the escape sequences of the help text resolved.
// It must only be called after Next returned a help entry.
// The returned byte slices become invalid after the next call to Next.
func (p *PromParser) Help() ([]byte, []byte) {
        name := p.l.b[p.offsets[0]:p.offsets[1]]

        // The replacer allocates, so text without any backslash is returned
        // untouched.
        if !strings.Contains(yoloString(p.text), `\`) {
                return name, p.text
        }
        unescaped := helpReplacer.Replace(string(p.text))
        return name, []byte(unescaped)
}

// Type returns the metric name and the metric type of the current entry.
// It must only be called after Next returned a type entry.
// The returned byte slice becomes invalid after the next call to Next.
func (p *PromParser) Type() ([]byte, model.MetricType) {
        from, to := p.offsets[0], p.offsets[1]
        name := p.l.b[from:to]
        return name, p.mtype
}

// Unit exists to satisfy the parser interface. The Prometheus text format has
// no units, so both the metric name and the unit are always nil.
func (p *PromParser) Unit() ([]byte, []byte) {
        var name, unit []byte
        return name, unit
}

// Comment returns the text of the comment entry that was just parsed.
// It must only be called after Next returned a comment entry.
// The returned byte slice becomes invalid after the next call to Next.
func (p *PromParser) Comment() []byte {
        text := p.text
        return text
}

// Metric stores the sorted label set of the current sample, including the
// metric name label, in *l. It returns the series text the labels were parsed
// from.
func (p *PromParser) Metric(l *labels.Labels) string {
        // The copy into a string is only required for the return value, but the
        // label substrings below are cut from it as well.
        s := string(p.series)
        // Offsets are positions in the lexer buffer; base shifts them into s.
        base := p.start

        p.builder.Reset()
        p.builder.Add(labels.MetricName, unreplace(s[p.offsets[0]-base:p.offsets[1]-base]))

        // After the metric name, offsets come in groups of four:
        // label name start/end followed by label value start/end.
        for i := 2; i < len(p.offsets); i += 4 {
                name := unreplace(s[p.offsets[i]-base : p.offsets[i+1]-base])
                val := unreplace(s[p.offsets[i+2]-base : p.offsets[i+3]-base])
                p.builder.Add(name, val)
        }

        p.builder.Sort()
        *l = p.builder.Labels()

        return s
}

// Exemplar is part of the Parser interface. The classic Prometheus text format
// has no exemplars, so the passed exemplar is left untouched and false is
// always returned.
func (p *PromParser) Exemplar(*exemplar.Exemplar) bool {
        const found = false
        return found
}

// CreatedTimestamp is not implemented for this parser yet and always returns
// nil.
// TODO(bwplotka): https://github.com/prometheus/prometheus/issues/12980
func (p *PromParser) CreatedTimestamp() *int64 {
        var created *int64
        return created
}

// nextToken asks the promlexer for tokens until it yields one that is not
// whitespace (tabs and spaces) and returns that token.
func (p *PromParser) nextToken() token {
        tok := p.l.Lex()
        for tok == tWhitespace {
                tok = p.l.Lex()
        }
        return tok
}

// parseError builds an error describing that exp was expected but the token
// got was found. The message quotes the offending token text and the input of
// the current entry, both extended by one byte past the lexer position where
// the buffer allows it.
func (p *PromParser) parseError(exp string, got token) error {
        end := len(p.l.b)
        if next := p.l.i + 1; next <= end {
                end = next
        }
        tokenText := p.l.b[p.l.start:end]
        entryText := p.l.b[p.start:end]
        return fmt.Errorf("%s, got %q (%q) while parsing: %q", exp, tokenText, got, entryText)
}

// Next advances the parser to the next entry (sample, HELP, TYPE or comment).
// It returns (EntryInvalid, io.EOF) once the end of the input is reached, and
// (EntryInvalid, err) with a descriptive error if the input is malformed.
// Full blank lines are skipped.
func (p *PromParser) Next() (Entry, error) {
        var (
                err error
                t   token
        )

        // Skip full blank lines in a loop rather than by calling Next
        // recursively: Go does not eliminate tail calls, so a long run of blank
        // lines would otherwise add one stack frame per line.
        for {
                p.start = p.l.i
                p.offsets = p.offsets[:0]
                if t = p.nextToken(); t != tLinebreak {
                        break
                }
        }

        switch t {
        case tEOF:
                return EntryInvalid, io.EOF

        case tHelp, tType:
                switch t2 := p.nextToken(); t2 {
                case tMName:
                        mStart := p.l.start
                        mEnd := p.l.i
                        // A quoted metric name is stored without its quotes.
                        if p.l.b[mStart] == '"' && p.l.b[mEnd-1] == '"' {
                                mStart++
                                mEnd--
                        }
                        p.offsets = append(p.offsets, mStart, mEnd)
                default:
                        return EntryInvalid, p.parseError("expected metric name after "+t.String(), t2)
                }
                switch t2 := p.nextToken(); t2 {
                case tText:
                        // The first byte of the token is dropped; what remains is
                        // the actual text.
                        if len(p.l.buf()) > 1 {
                                p.text = p.l.buf()[1:]
                        } else {
                                p.text = []byte{}
                        }
                default:
                        return EntryInvalid, fmt.Errorf("expected text in %s, got %v", t.String(), t2.String())
                }
                switch t {
                case tType:
                        switch s := yoloString(p.text); s {
                        case "counter":
                                p.mtype = model.MetricTypeCounter
                        case "gauge":
                                p.mtype = model.MetricTypeGauge
                        case "histogram":
                                p.mtype = model.MetricTypeHistogram
                        case "summary":
                                p.mtype = model.MetricTypeSummary
                        case "untyped":
                                p.mtype = model.MetricTypeUnknown
                        default:
                                return EntryInvalid, fmt.Errorf("invalid metric type %q", s)
                        }
                case tHelp:
                        if !utf8.Valid(p.text) {
                                return EntryInvalid, fmt.Errorf("help text %q is not a valid utf8 string", p.text)
                        }
                }
                if t3 := p.nextToken(); t3 != tLinebreak {
                        return EntryInvalid, p.parseError("linebreak expected after metadata", t3)
                }
                switch t {
                case tHelp:
                        return EntryHelp, nil
                case tType:
                        return EntryType, nil
                }
        case tComment:
                p.text = p.l.buf()
                if t3 := p.nextToken(); t3 != tLinebreak {
                        return EntryInvalid, p.parseError("linebreak expected after comment", t3)
                }
                return EntryComment, nil
        case tBraceOpen:
                // We found a brace, so make room for the eventual metric name. If these
                // values aren't updated, then the metric name was not set inside the
                // braces and we can return an error. The existing backing array of
                // p.offsets is reused instead of allocating a new slice.
                if len(p.offsets) == 0 {
                        p.offsets = append(p.offsets, -1, -1)
                }
                if err := p.parseLVals(); err != nil {
                        return EntryInvalid, err
                }

                p.series = p.l.b[p.start:p.l.i]
                return p.parseMetricSuffix(p.nextToken())
        case tMName:
                p.offsets = append(p.offsets, p.start, p.l.i)
                p.series = p.l.b[p.start:p.l.i]
                t2 := p.nextToken()
                // If there's a brace, consume and parse the label values.
                if t2 == tBraceOpen {
                        if err := p.parseLVals(); err != nil {
                                return EntryInvalid, err
                        }
                        p.series = p.l.b[p.start:p.l.i]
                        t2 = p.nextToken()
                }
                return p.parseMetricSuffix(t2)

        default:
                err = p.parseError("expected a valid start token", t)
        }
        return EntryInvalid, err
}

// parseLVals parses the contents inside the braces of a series: label
// name/value pairs and, optionally, a quoted metric name. The positions of
// everything found are recorded in p.offsets. It returns nil once the closing
// brace has been consumed and an error if the contents are malformed.
func (p *PromParser) parseLVals() error {
        t := p.nextToken()
        for {
                curTStart := p.l.start
                curTI := p.l.i
                // Remember the kind of the name token; t is overwritten below.
                nameTok := t
                switch t {
                case tBraceClose:
                        return nil
                case tLName, tQString:
                default:
                        return p.parseError("expected label name", t)
                }

                t = p.nextToken()
                // A quoted string followed by a comma or brace is a metric name. Set the
                // offsets and continue processing.
                if t == tComma || t == tBraceClose {
                        // Only a quoted string can be a metric name here. An unquoted
                        // label name must be followed by '='; stripping "quotes" off
                        // it would record a mangled name and, for a one-character
                        // name, inverted offsets.
                        if nameTok != tQString {
                                return p.parseError("expected equal", t)
                        }
                        if p.offsets[0] != -1 || p.offsets[1] != -1 {
                                return fmt.Errorf("metric name already set while parsing: %q", p.l.b[p.start:p.l.i])
                        }
                        // Strip the surrounding quotes.
                        p.offsets[0] = curTStart + 1
                        p.offsets[1] = curTI - 1
                        if t == tBraceClose {
                                return nil
                        }
                        t = p.nextToken()
                        continue
                }
                // We have a label name, and it might be quoted.
                if p.l.b[curTStart] == '"' {
                        curTStart++
                        curTI--
                }
                p.offsets = append(p.offsets, curTStart, curTI)
                if t != tEqual {
                        return p.parseError("expected equal", t)
                }
                if t := p.nextToken(); t != tLValue {
                        return p.parseError("expected label value", t)
                }
                if !utf8.Valid(p.l.buf()) {
                        return fmt.Errorf("invalid UTF-8 label value: %q", p.l.buf())
                }

                // The promlexer ensures the value string is quoted. Strip first
                // and last character.
                p.offsets = append(p.offsets, p.l.start+1, p.l.i-1)

                // Free trailing commas are allowed. NOTE: because the comma is only
                // consumed when present, label pairs that are not separated by a
                // comma are accepted as well, unlike in OpenMetrics. It is not
                // clear if this is intended or an accidental bug.
                if t = p.nextToken(); t == tComma {
                        t = p.nextToken()
                }
        }
}

// parseMetricSuffix handles what follows the metric name and labels on a
// sample line: the value, an optional timestamp and the closing linebreak.
// t is the token at which parsing starts and must be the value token.
// On success it returns EntrySeries with p.val, p.ts and p.hasTS updated.
func (p *PromParser) parseMetricSuffix(t token) (Entry, error) {
        // -1 means a brace-first series never provided its metric name.
        if p.offsets[0] == -1 {
                return EntryInvalid, fmt.Errorf("metric name not set while parsing: %q", p.l.b[p.start:p.l.i])
        }
        if t != tValue {
                return EntryInvalid, p.parseError("expected value after metric", t)
        }

        var err error
        if p.val, err = parseFloat(yoloString(p.l.buf())); err != nil {
                return EntryInvalid, fmt.Errorf("%w while parsing: %q", err, p.l.b[p.start:p.l.i])
        }
        // Ensure canonical NaN value.
        if math.IsNaN(p.val) {
                p.val = math.Float64frombits(value.NormalNaN)
        }

        p.hasTS = false
        following := p.nextToken()
        if following == tLinebreak {
                // No timestamp on this line.
                return EntrySeries, nil
        }
        if following != tTimestamp {
                return EntryInvalid, p.parseError("expected timestamp or new record", following)
        }

        p.hasTS = true
        if p.ts, err = strconv.ParseInt(yoloString(p.l.buf()), 10, 64); err != nil {
                return EntryInvalid, fmt.Errorf("%w while parsing: %q", err, p.l.b[p.start:p.l.i])
        }
        if last := p.nextToken(); last != tLinebreak {
                return EntryInvalid, p.parseError("expected next entry after timestamp", last)
        }
        return EntrySeries, nil
}

var (
        // lvalReplacer resolves the escape sequences \", \\ and \n to a double
        // quote, a backslash and a line feed respectively.
        lvalReplacer = strings.NewReplacer(`\"`, "\"", `\\`, "\\", `\n`, "\n")

        // helpReplacer resolves the escape sequences \\ and \n to a backslash
        // and a line feed respectively.
        helpReplacer = strings.NewReplacer(`\\`, "\\", `\n`, "\n")
)

// unreplace resolves the escape sequences handled by lvalReplacer in s.
// Strings without any backslash are returned as they are, which avoids the
// allocations the replacer would cause.
func unreplace(s string) string {
        if !strings.Contains(s, `\`) {
                return s
        }
        return lvalReplacer.Replace(s)
}

// yoloString reinterprets b as a string without copying. The result shares
// memory with b, so it is only valid as long as b is neither modified nor
// reused.
func yoloString(b []byte) string {
        view := (*string)(unsafe.Pointer(&b))
        return *view
}

// parseFloat parses s as a 64-bit float. Only the float formats accepted
// before Go 1.13 are allowed: any input containing 'p', 'P' or '_' (used by
// hexadecimal floats and digit separators) is rejected with an error.
func parseFloat(s string) (float64, error) {
        if strings.IndexAny(s, "pP_") != -1 {
                return 0, fmt.Errorf("unsupported character in float")
        }
        return strconv.ParseFloat(s, 64)
}

// Copyright 2021 The Prometheus Authors
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package textparse

import (
        "bytes"
        "encoding/binary"
        "errors"
        "fmt"
        "io"
        "math"
        "strings"
        "unicode/utf8"

        "github.com/gogo/protobuf/proto"
        "github.com/gogo/protobuf/types"
        "github.com/prometheus/common/model"

        "github.com/prometheus/prometheus/model/exemplar"
        "github.com/prometheus/prometheus/model/histogram"
        "github.com/prometheus/prometheus/model/labels"

        dto "github.com/prometheus/prometheus/prompb/io/prometheus/client"
)

// ProtobufParser is a very inefficient way of unmarshaling the old Prometheus
// protobuf format and then presenting it as if it were parsed by a
// Prometheus-2-style text parser. This is only done so that we can easily plug
// in the protobuf format into Prometheus 2. For future use (with the final
// format that will be used for native histograms), we have to revisit the
// parsing. A lot of the efficiency tricks of the Prometheus-2-style parsing
// could be used in a similar fashion (byte-slice pointers into the raw
// payload), which requires some hand-coded protobuf handling. But the current
// parsers all expect the full series name (metric name plus label pairs) as one
// string, which is not how things are represented in the protobuf format. If
// the re-arrangement work is actually causing problems (which has to be seen),
// that expectation needs to be changed.
type ProtobufParser struct {
        in        []byte // The input to parse.
        inPos     int    // Position within the input.
        metricPos int    // Position within Metric slice.
        // fieldPos is the position within a Summary or (legacy) Histogram. -2
        // is the count. -1 is the sum. Otherwise it is the index within
        // quantiles/buckets.
        fieldPos    int
        fieldsDone  bool // true if no more fields of a Summary or (legacy) Histogram to be processed.
        redoClassic bool // true after parsing a native histogram if we need to parse it again as a classic histogram.
        // exemplarPos is the position within the exemplars slice of a native histogram.
        exemplarPos int

        // exemplarReturned is set to true each time an exemplar has been
        // returned, and set back to false upon each Next() call.
        exemplarReturned bool

        // state is marked by the entry we are processing. EntryInvalid implies
        // that we have to decode the next MetricFamily.
        state Entry

        builder labels.ScratchBuilder // held here to reduce allocations when building Labels

        mf *dto.MetricFamily

        // Whether to also parse a classic histogram that is also present as a
        // native histogram.
        parseClassicHistograms bool

        // The following are just shenanigans to satisfy the Parser interface.
        metricBytes *bytes.Buffer // A somewhat fluid representation of the current metric.
}

// NewProtobufParser creates a parser for the protobuf payload in b.
// parseClassicHistograms selects whether classic histograms are parsed as well
// when a native histogram is present, and st is the symbol table used for the
// labels the parser builds.
func NewProtobufParser(b []byte, parseClassicHistograms bool, st *labels.SymbolTable) Parser {
        parser := &ProtobufParser{
                in:                     b,
                state:                  EntryInvalid,
                mf:                     new(dto.MetricFamily),
                metricBytes:            new(bytes.Buffer),
                parseClassicHistograms: parseClassicHistograms,
                builder:                labels.NewScratchBuilderWithSymbolTable(st, 16),
        }
        return parser
}

// Series returns the bytes of the current series, its timestamp (nil if none
// is set) and its float64 value. For summaries and classic histograms the
// value depends on fieldPos: the count, the sum, or one quantile/bucket.
// It panics if the metric family has an unexpected type.
func (p *ProtobufParser) Series() ([]byte, *int64, float64) {
        metric := p.mf.GetMetric()[p.metricPos]
        tsMs := metric.GetTimestampMs()
        var val float64

        switch p.mf.GetType() {
        case dto.MetricType_COUNTER:
                val = metric.GetCounter().GetValue()
        case dto.MetricType_GAUGE:
                val = metric.GetGauge().GetValue()
        case dto.MetricType_UNTYPED:
                val = metric.GetUntyped().GetValue()
        case dto.MetricType_SUMMARY:
                summary := metric.GetSummary()
                switch {
                case p.fieldPos == -2:
                        val = float64(summary.GetSampleCount())
                case p.fieldPos == -1:
                        val = summary.GetSampleSum()
                        // Need to detect summaries without quantile here.
                        if len(summary.GetQuantile()) == 0 {
                                p.fieldsDone = true
                        }
                default:
                        val = summary.GetQuantile()[p.fieldPos].GetValue()
                }
        case dto.MetricType_HISTOGRAM, dto.MetricType_GAUGE_HISTOGRAM:
                // This should only happen for a classic histogram.
                hist := metric.GetHistogram()
                buckets := hist.GetBucket()
                switch {
                case p.fieldPos == -1:
                        val = hist.GetSampleSum()
                case p.fieldPos == -2 || p.fieldPos >= len(buckets):
                        // The count itself, or a position past the last bucket:
                        // both report the sample count, preferring the float
                        // variant and falling back to the integer one if it is 0.
                        val = hist.GetSampleCountFloat()
                        if val == 0 {
                                val = float64(hist.GetSampleCount())
                        }
                default:
                        val = buckets[p.fieldPos].GetCumulativeCountFloat()
                        if val == 0 {
                                val = float64(buckets[p.fieldPos].GetCumulativeCount())
                        }
                }
        default:
                panic("encountered unexpected metric type, this is a bug")
        }

        if tsMs == 0 {
                // TODO(beorn7): We assume here that ts==0 means no timestamp. That's
                // not true in general, but proto3 originally has no distinction between
                // unset and default. At a later stage, the `optional` keyword was
                // (re-)introduced in proto3, but gogo-protobuf never got updated to
                // support it. (Note that setting `[(gogoproto.nullable) = true]` for
                // the `timestamp_ms` field doesn't help, either.) We plan to migrate
                // away from gogo-protobuf to an actively maintained protobuf
                // implementation. Once that's done, we can simply use the `optional`
                // keyword and check for the unset state explicitly.
                return p.metricBytes.Bytes(), nil, val
        }
        return p.metricBytes.Bytes(), &tsMs, val
}

// Histogram returns the bytes of the current series, its timestamp (nil if
// none is set) and the native histogram of the current sample.
//
// Exactly one of the two histogram return values is non-nil. If the
// SampleCountFloat or the ZeroCountFloat of the proto message is > 0, the
// sample is returned as a FloatHistogram; otherwise it is returned as an
// (integer) Histogram. Compact is called on the histogram before it is
// returned.
func (p *ProtobufParser) Histogram() ([]byte, *int64, *histogram.Histogram, *histogram.FloatHistogram) {
        metric := p.mf.GetMetric()[p.metricPos]
        tsMs := metric.GetTimestampMs()
        h := metric.GetHistogram()

        if p.parseClassicHistograms && len(h.GetBucket()) > 0 {
                p.redoClassic = true
        }

        // Nasty hack: Assume that ts==0 means no timestamp. That's not true in
        // general, but proto3 has no distinction between unset and
        // default. Need to avoid in the final format.
        var ts *int64
        if tsMs != 0 {
                ts = &tsMs
        }
        isGauge := p.mf.GetType() == dto.MetricType_GAUGE_HISTOGRAM

        // The span layout is shared by the float and the integer flavor.
        posSpans := make([]histogram.Span, len(h.GetPositiveSpan()))
        for i, span := range h.GetPositiveSpan() {
                posSpans[i] = histogram.Span{Offset: span.GetOffset(), Length: span.GetLength()}
        }
        negSpans := make([]histogram.Span, len(h.GetNegativeSpan()))
        for i, span := range h.GetNegativeSpan() {
                negSpans[i] = histogram.Span{Offset: span.GetOffset(), Length: span.GetLength()}
        }

        if h.GetSampleCountFloat() > 0 || h.GetZeroCountFloat() > 0 {
                // It is a float histogram.
                fh := &histogram.FloatHistogram{
                        Count:           h.GetSampleCountFloat(),
                        Sum:             h.GetSampleSum(),
                        ZeroThreshold:   h.GetZeroThreshold(),
                        ZeroCount:       h.GetZeroCountFloat(),
                        Schema:          h.GetSchema(),
                        PositiveSpans:   posSpans,
                        PositiveBuckets: h.GetPositiveCount(),
                        NegativeSpans:   negSpans,
                        NegativeBuckets: h.GetNegativeCount(),
                }
                if isGauge {
                        fh.CounterResetHint = histogram.GaugeType
                }
                fh.Compact(0)
                return p.metricBytes.Bytes(), ts, nil, fh
        }

        sh := &histogram.Histogram{
                Count:           h.GetSampleCount(),
                Sum:             h.GetSampleSum(),
                ZeroThreshold:   h.GetZeroThreshold(),
                ZeroCount:       h.GetZeroCount(),
                Schema:          h.GetSchema(),
                PositiveSpans:   posSpans,
                PositiveBuckets: h.GetPositiveDelta(),
                NegativeSpans:   negSpans,
                NegativeBuckets: h.GetNegativeDelta(),
        }
        if isGauge {
                sh.CounterResetHint = histogram.GaugeType
        }
        sh.Compact(0)
        return p.metricBytes.Bytes(), ts, sh, nil
}

// Help returns the current contents of the metric bytes buffer as the metric
// name, together with the help text of the current metric family.
// Must only be called after Next returned a help entry.
// The returned byte slices become invalid after the next call to Next.
func (p *ProtobufParser) Help() ([]byte, []byte) {
        name := p.metricBytes.Bytes()
        help := []byte(p.mf.GetHelp())
        return name, help
}

// Type returns the current contents of the metric bytes buffer as the metric
// name, together with the metric type of the current metric family. Types
// without a direct counterpart are reported as model.MetricTypeUnknown.
// Must only be called after Next returned a type entry.
// The returned byte slice becomes invalid after the next call to Next.
func (p *ProtobufParser) Type() ([]byte, model.MetricType) {
        name := p.metricBytes.Bytes()

        var mt model.MetricType
        switch p.mf.GetType() {
        case dto.MetricType_COUNTER:
                mt = model.MetricTypeCounter
        case dto.MetricType_GAUGE:
                mt = model.MetricTypeGauge
        case dto.MetricType_HISTOGRAM:
                mt = model.MetricTypeHistogram
        case dto.MetricType_GAUGE_HISTOGRAM:
                mt = model.MetricTypeGaugeHistogram
        case dto.MetricType_SUMMARY:
                mt = model.MetricTypeSummary
        default:
                mt = model.MetricTypeUnknown
        }
        return name, mt
}

// Unit returns the current contents of the metric bytes buffer as the metric
// name, together with the unit of the current metric family.
// Must only be called after Next returned a unit entry.
// The returned byte slices become invalid after the next call to Next.
func (p *ProtobufParser) Unit() ([]byte, []byte) {
        name := p.metricBytes.Bytes()
        unit := []byte(p.mf.GetUnit())
        return name, unit
}

// Comment exists to satisfy the parser interface. The protobuf format has no
// comments, so the result is always nil.
func (p *ProtobufParser) Comment() []byte {
        var none []byte
        return none
}

// Metric stores the sorted label set of the current sample in *l. The set
// consists of the metric name label, the labels of the current metric and, if
// getMagicLabel reports one as needed, an additional label. It returns the
// contents of the metric bytes buffer as a string.
func (p *ProtobufParser) Metric(l *labels.Labels) string {
        p.builder.Reset()
        p.builder.Add(labels.MetricName, p.getMagicName())

        pairs := p.mf.GetMetric()[p.metricPos].GetLabel()
        for i := range pairs {
                p.builder.Add(pairs[i].GetName(), pairs[i].GetValue())
        }

        needed, name, value := p.getMagicLabel()
        if needed {
                p.builder.Add(name, value)
        }

        // Labels must be handed out sorted.
        p.builder.Sort()
        *l = p.builder.Labels()

        return p.metricBytes.String()
}

// Exemplar writes the exemplar of the current sample into the passed
// exemplar. It returns if an exemplar exists or not. In case of a native
// histogram, the exemplars in the native histogram will be returned.
// If this field is empty, the classic bucket section is still used for exemplars.
// To ingest all exemplars, call the Exemplar method repeatedly until it returns false.
//
// Only counters and (gauge) histograms can yield an exemplar; every other
// metric type returns false. On success, ex.Value and ex.Labels are
// overwritten, and ex.HasTs/ex.Ts (milliseconds) are set only if the exemplar
// carries a timestamp. The method advances p.exemplarPos or p.fieldPos while
// iterating native histogram exemplars and sets p.exemplarReturned.
func (p *ProtobufParser) Exemplar(ex *exemplar.Exemplar) bool {
        if p.exemplarReturned && p.state == EntrySeries {
                // We only ever return one exemplar per (non-native-histogram) series.
                return false
        }
        m := p.mf.GetMetric()[p.metricPos]
        var exProto *dto.Exemplar
        switch p.mf.GetType() {
        case dto.MetricType_COUNTER:
                exProto = m.GetCounter().GetExemplar()
        case dto.MetricType_HISTOGRAM, dto.MetricType_GAUGE_HISTOGRAM:
                // In the EntrySeries state the histogram is being emitted as classic
                // series; otherwise it is handled as a native histogram.
                isClassic := p.state == EntrySeries
                if !isClassic && len(m.GetHistogram().GetExemplars()) > 0 {
                        // Native histogram with a dedicated exemplars list: continue at
                        // p.exemplarPos and skip entries that are nil or lack a timestamp.
                        exs := m.GetHistogram().GetExemplars()
                        for p.exemplarPos < len(exs) {
                                exProto = exs[p.exemplarPos]
                                p.exemplarPos++
                                if exProto != nil && exProto.GetTimestamp() != nil {
                                        break
                                }
                        }
                        // The loop ran off the end while looking at a timestamp-less
                        // exemplar, so there is nothing usable left.
                        if exProto != nil && exProto.GetTimestamp() == nil {
                                return false
                        }
                } else {
                        // Take exemplars from the classic buckets, addressed by p.fieldPos.
                        bb := m.GetHistogram().GetBucket()
                        if p.fieldPos < 0 {
                                if isClassic {
                                        return false // At _count or _sum.
                                }
                                p.fieldPos = 0 // Start at 1st bucket for native histograms.
                        }
                        for p.fieldPos < len(bb) {
                                exProto = bb[p.fieldPos].GetExemplar()
                                if isClassic {
                                        // Classic series: only the bucket at the current field
                                        // position is considered, and the position is left alone.
                                        break
                                }
                                p.fieldPos++
                                // We deliberately drop exemplars with no timestamp only for native histograms.
                                // NOTE(review): isClassic is always false here because of the
                                // break above, so only the timestamp check is effective.
                                if exProto != nil && (isClassic || exProto.GetTimestamp() != nil) {
                                        break // Found a classic histogram exemplar or a native histogram exemplar with a timestamp.
                                }
                        }
                        // If the last exemplar for native histograms has no timestamp, ignore it.
                        // NOTE(review): exProto may be nil here; this relies on GetTimestamp
                        // being nil-safe like the other generated getters - confirm.
                        if !isClassic && exProto.GetTimestamp() == nil {
                                return false
                        }
                }
        default:
                return false
        }
        if exProto == nil {
                return false
        }
        ex.Value = exProto.GetValue()
        if ts := exProto.GetTimestamp(); ts != nil {
                ex.HasTs = true
                // Convert seconds plus nanoseconds into milliseconds.
                ex.Ts = ts.GetSeconds()*1000 + int64(ts.GetNanos()/1_000_000)
        }
        // The scratch builder is reused to assemble the sorted exemplar labels.
        p.builder.Reset()
        for _, lp := range exProto.GetLabel() {
                p.builder.Add(lp.GetName(), lp.GetValue())
        }
        p.builder.Sort()
        ex.Labels = p.builder.Labels()
        p.exemplarReturned = true
        return true
}

// CreatedTimestamp returns the created timestamp (CT) of the current metric in
// milliseconds. Only counters, summaries and (gauge) histograms are consulted.
// It returns nil if the CT is not present or is not a valid timestamp (e.g. a
// negative value).
func (p *ProtobufParser) CreatedTimestamp() *int64 {
        var protoCT *types.Timestamp

        metrics := p.mf.GetMetric()
        switch p.mf.GetType() {
        case dto.MetricType_COUNTER:
                protoCT = metrics[p.metricPos].GetCounter().GetCreatedTimestamp()
        case dto.MetricType_SUMMARY:
                protoCT = metrics[p.metricPos].GetSummary().GetCreatedTimestamp()
        case dto.MetricType_HISTOGRAM, dto.MetricType_GAUGE_HISTOGRAM:
                protoCT = metrics[p.metricPos].GetHistogram().GetCreatedTimestamp()
        }

        created, err := types.TimestampFromProto(protoCT)
        if err != nil {
                // A nil or invalid timestamp surfaces as an error here and is
                // deliberately ignored.
                return nil
        }
        millis := created.UnixMilli()
        return &millis
}

// Next advances the parser to the next "sample" (emulating the behavior of a
// text format parser). It returns (EntryInvalid, io.EOF) if no samples were
// read.
//
// The parser is a state machine over p.state. From EntryInvalid it reads the
// next metric family from the input and validates its name, help, type and
// unit, then reports EntryHelp followed by EntryType. After that it emits one
// EntrySeries or EntryHistogram per sample. For summaries and classic
// histograms, p.fieldPos walks through the individual series of one metric:
// -2 is "_count", -1 is "_sum", and values >= 0 index quantiles or buckets.
// Once all metrics of a family are consumed, the next family is read.
//
// Any validation error is returned together with EntryInvalid.
func (p *ProtobufParser) Next() (Entry, error) {
        p.exemplarReturned = false
        switch p.state {
        case EntryInvalid:
                p.metricPos = 0
                p.fieldPos = -2
                // A new metric family starts, so native histogram exemplars must be
                // iterated from the beginning again.
                p.exemplarPos = 0
                n, err := readDelimited(p.in[p.inPos:], p.mf)
                p.inPos += n
                if err != nil {
                        return p.state, err
                }

                // Skip empty metric families.
                if len(p.mf.GetMetric()) == 0 {
                        return p.Next()
                }

                // We are at the beginning of a metric family. Put only the name
                // into metricBytes and validate only name, help, and type for now.
                name := p.mf.GetName()
                if !model.IsValidMetricName(model.LabelValue(name)) {
                        return EntryInvalid, fmt.Errorf("invalid metric name: %s", name)
                }
                if help := p.mf.GetHelp(); !utf8.ValidString(help) {
                        return EntryInvalid, fmt.Errorf("invalid help for metric %q: %s", name, help)
                }
                switch p.mf.GetType() {
                case dto.MetricType_COUNTER,
                        dto.MetricType_GAUGE,
                        dto.MetricType_HISTOGRAM,
                        dto.MetricType_GAUGE_HISTOGRAM,
                        dto.MetricType_SUMMARY,
                        dto.MetricType_UNTYPED:
                        // All good.
                default:
                        return EntryInvalid, fmt.Errorf("unknown metric type for metric %q: %s", name, p.mf.GetType())
                }
                // A non-empty unit must be a "_"-separated suffix of the metric name.
                // For counters ending in "_total", the unit is expected right before
                // that suffix (6 is len("_total")).
                unit := p.mf.GetUnit()
                if len(unit) > 0 {
                        if p.mf.GetType() == dto.MetricType_COUNTER && strings.HasSuffix(name, "_total") {
                                if !strings.HasSuffix(name[:len(name)-6], unit) || len(name)-6 < len(unit)+1 || name[len(name)-6-len(unit)-1] != '_' {
                                        return EntryInvalid, fmt.Errorf("unit %q not a suffix of counter %q", unit, name)
                                }
                        } else if !strings.HasSuffix(name, unit) || len(name) < len(unit)+1 || name[len(name)-len(unit)-1] != '_' {
                                return EntryInvalid, fmt.Errorf("unit %q not a suffix of metric %q", unit, name)
                        }
                }
                p.metricBytes.Reset()
                p.metricBytes.WriteString(name)

                p.state = EntryHelp
        case EntryHelp:
                p.state = EntryType
        case EntryType:
                // The first metric of the family decides whether the family starts
                // out as native histograms or as plain series.
                t := p.mf.GetType()
                if (t == dto.MetricType_HISTOGRAM || t == dto.MetricType_GAUGE_HISTOGRAM) &&
                        isNativeHistogram(p.mf.GetMetric()[0].GetHistogram()) {
                        p.state = EntryHistogram
                } else {
                        p.state = EntrySeries
                }
                if err := p.updateMetricBytes(); err != nil {
                        return EntryInvalid, err
                }
        case EntryHistogram, EntrySeries:
                if p.redoClassic {
                        // Re-run the current metric as classic series. fieldPos starts
                        // at -3 so that the increment below lands on -2 ("_count").
                        p.redoClassic = false
                        p.state = EntrySeries
                        p.fieldPos = -3
                        p.fieldsDone = false
                }
                t := p.mf.GetType()
                if p.state == EntrySeries && !p.fieldsDone &&
                        (t == dto.MetricType_SUMMARY ||
                                t == dto.MetricType_HISTOGRAM ||
                                t == dto.MetricType_GAUGE_HISTOGRAM) {
                        // More series (count, sum, quantiles/buckets) remain for the
                        // current metric.
                        p.fieldPos++
                } else {
                        // Move on to the next metric of the family.
                        p.metricPos++
                        p.fieldPos = -2
                        p.fieldsDone = false
                        // exemplarPos indexes into the exemplars of a single metric, so
                        // it has to restart at 0 for the next one. Otherwise exemplars
                        // of subsequent native histograms would be skipped.
                        p.exemplarPos = 0
                        // If this is a metric family containing native
                        // histograms, we have to switch back to native
                        // histograms after parsing a classic histogram.
                        if p.state == EntrySeries &&
                                (t == dto.MetricType_HISTOGRAM || t == dto.MetricType_GAUGE_HISTOGRAM) &&
                                isNativeHistogram(p.mf.GetMetric()[0].GetHistogram()) {
                                p.state = EntryHistogram
                        }
                }
                if p.metricPos >= len(p.mf.GetMetric()) {
                        // Family exhausted: start over with the next one.
                        p.state = EntryInvalid
                        return p.Next()
                }
                if err := p.updateMetricBytes(); err != nil {
                        return EntryInvalid, err
                }
        default:
                return EntryInvalid, fmt.Errorf("invalid protobuf parsing state: %d", p.state)
        }
        return p.state, nil
}

// updateMetricBytes rebuilds p.metricBytes for the current sample: the
// (possibly suffixed) metric name followed by every label name and value, each
// preceded by model.SeparatorByte, and finally the magic label if one is
// needed. It returns an error as soon as a label name is invalid or a label
// value is not valid UTF-8; the buffer is then left partially written.
func (p *ProtobufParser) updateMetricBytes() error {
        buf := p.metricBytes
        buf.Reset()
        buf.WriteString(p.getMagicName())

        pairs := p.mf.GetMetric()[p.metricPos].GetLabel()
        for i := range pairs {
                buf.WriteByte(model.SeparatorByte)
                labelName := pairs[i].GetName()
                if !model.LabelName(labelName).IsValid() {
                        return fmt.Errorf("invalid label name: %s", labelName)
                }
                buf.WriteString(labelName)

                buf.WriteByte(model.SeparatorByte)
                labelValue := pairs[i].GetValue()
                if !utf8.ValidString(labelValue) {
                        return fmt.Errorf("invalid label value: %s", labelValue)
                }
                buf.WriteString(labelValue)
        }

        needed, magicName, magicValue := p.getMagicLabel()
        if !needed {
                return nil
        }
        buf.WriteByte(model.SeparatorByte)
        buf.WriteString(magicName)
        buf.WriteByte(model.SeparatorByte)
        buf.WriteString(magicValue)
        return nil
}

// getMagicName usually just returns the name of the current metric family but
// adds a magic suffix ("_count", "_sum", "_bucket") if needed according to the
// current parser state and field position.
func (p *ProtobufParser) getMagicName() string {
        name := p.mf.GetName()
        if p.state == EntryHistogram {
                return name
        }

        t := p.mf.GetType()
        isHistogram := t == dto.MetricType_HISTOGRAM || t == dto.MetricType_GAUGE_HISTOGRAM
        if !isHistogram && t != dto.MetricType_SUMMARY {
                // Only summaries and histograms expand into suffixed series.
                return name
        }

        switch p.fieldPos {
        case -2:
                return name + "_count"
        case -1:
                return name + "_sum"
        }
        if isHistogram {
                return name + "_bucket"
        }
        // Summary quantiles keep the plain name.
        return name
}

// getMagicLabel reports whether the current sample needs a magic label
// ("quantile" for summaries, "le" for histograms) and, if so, returns its name
// and value. As a side effect it sets p.fieldsDone once the last quantile or
// the +Inf bucket has been reached.
func (p *ProtobufParser) getMagicLabel() (bool, string, string) {
        if p.state != EntryHistogram && p.fieldPos >= 0 {
                switch p.mf.GetType() {
                case dto.MetricType_SUMMARY:
                        quantiles := p.mf.GetMetric()[p.metricPos].GetSummary().GetQuantile()
                        current := quantiles[p.fieldPos]
                        p.fieldsDone = p.fieldPos == len(quantiles)-1
                        return true, model.QuantileLabel, formatOpenMetricsFloat(current.GetQuantile())
                case dto.MetricType_HISTOGRAM, dto.MetricType_GAUGE_HISTOGRAM:
                        buckets := p.mf.GetMetric()[p.metricPos].GetHistogram().GetBucket()
                        if p.fieldPos >= len(buckets) {
                                // Past the last explicit bucket: emit the +Inf bucket.
                                p.fieldsDone = true
                                return true, model.BucketLabel, "+Inf"
                        }
                        current := buckets[p.fieldPos]
                        upperBound := current.GetUpperBound()
                        p.fieldsDone = math.IsInf(upperBound, +1)
                        return true, model.BucketLabel, formatOpenMetricsFloat(upperBound)
                }
        }
        return false, "", ""
}

// errInvalidVarint is returned by readDelimited when the length prefix of a
// message cannot be decoded as a varint or is longer than a 32-bit varint.
var errInvalidVarint = errors.New("protobufparse: invalid varint encountered")

// readDelimited is essentially doing what the function of the same name in
// github.com/matttproud/golang_protobuf_extensions/pbutil is doing, but it is
// specific to a MetricFamily, utilizes the more efficient gogo-protobuf
// unmarshaling, and acts on a byte slice directly without any additional
// staging buffers.
//
// b must start with a varint length prefix followed by the encoded message.
// On success, mf holds the decoded message and n is the total number of bytes
// consumed (prefix plus message). It returns io.EOF for an empty b,
// errInvalidVarint for a malformed or oversized length prefix, and an error if
// b is shorter than the announced message. In all those cases n is 0 and mf is
// left untouched. If unmarshaling fails, n is still the number of bytes
// consumed and the unmarshal error is returned.
func readDelimited(b []byte, mf *dto.MetricFamily) (n int, err error) {
        if len(b) == 0 {
                return 0, io.EOF
        }
        messageLength, varIntLength := proto.DecodeVarint(b)
        if varIntLength == 0 || varIntLength > binary.MaxVarintLen32 {
                return 0, errInvalidVarint
        }
        // A 5-byte varint can carry up to 35 bits, which does not fit into an int
        // on 32-bit platforms. Compare in uint64 space before converting, so that
        // a truncated or negative length can neither slip through this check nor
        // cause an out-of-range slice expression below.
        if messageLength > uint64(len(b)-varIntLength) {
                return 0, fmt.Errorf("protobufparse: insufficient length of buffer, expected at least %d bytes, got %d bytes", uint64(varIntLength)+messageLength, len(b))
        }
        totalLength := varIntLength + int(messageLength)
        mf.Reset()
        return totalLength, mf.Unmarshal(b[varIntLength:totalLength])
}

// formatOpenMetricsFloat works like the usual Go string formatting of a float
// but appends ".0" if the resulting number would otherwise contain neither a
// "." nor an "e". NaN and the infinities are rendered as "NaN", "+Inf" and
// "-Inf".
func formatOpenMetricsFloat(f float64) string {
        // Special values first.
        if math.IsNaN(f) {
                return "NaN"
        }
        if math.IsInf(f, +1) {
                return "+Inf"
        }
        if math.IsInf(f, -1) {
                return "-Inf"
        }

        // A few common cases hardcoded.
        switch f {
        case 1:
                return "1.0"
        case 0:
                return "0.0"
        case -1:
                return "-1.0"
        }

        formatted := fmt.Sprint(f)
        if !strings.ContainsAny(formatted, "e.") {
                formatted += ".0"
        }
        return formatted
}

// isNativeHistogram reports whether h should be parsed as a native histogram.
// That is the case as soon as h has at least one positive or negative span, a
// zero threshold greater than 0, or a zero count greater than 0.
//
// A histogram with none of these could in principle still be meant as a native
// histogram (zero threshold of 0, no observations yet). Instrumentation
// libraries should then add a "no-op" span (e.g. length zero, offset zero) to
// signal the intent. Otherwise Prometheus parses it as a classic histogram as
// long as no observations have happened.
func isNativeHistogram(h *dto.Histogram) bool {
        hasSpans := len(h.GetPositiveSpan()) > 0 || len(h.GetNegativeSpan()) > 0
        if hasSpans {
                return true
        }
        return h.GetZeroThreshold() > 0 || h.GetZeroCount() > 0
}

// Copyright 2017 The Prometheus Authors
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package timestamp

import (
        "math"
        "time"
)

// FromTime converts t into a millisecond timestamp. The sub-millisecond part
// of t is truncated.
func FromTime(t time.Time) int64 {
        seconds := t.Unix()
        subSecondMillis := int64(t.Nanosecond()) / int64(time.Millisecond)
        return seconds*1000 + subSecondMillis
}

// Time converts the millisecond timestamp ts into a time.Time in UTC.
func Time(ts int64) time.Time {
        seconds := ts / 1000
        nanos := (ts % 1000) * int64(time.Millisecond)
        return time.Unix(seconds, nanos).UTC()
}

// FromFloatSeconds converts a timestamp given in (fractional) seconds into a
// millisecond timestamp, rounding to the nearest millisecond.
func FromFloatSeconds(ts float64) int64 {
        millis := math.Round(ts * 1000)
        return int64(millis)
}

// Copyright 2016 The Prometheus Authors
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package value

import (
        "math"
)

// Bit patterns (as produced by math.Float64bits) of the NaN values that carry
// a special meaning.
const (
        // NormalNaN is a quiet NaN. This is also math.NaN().
        NormalNaN uint64 = 0x7ff8000000000001

        // StaleNaN is a signaling NaN, due to the MSB of the mantissa being 0.
        // This value is chosen with many leading 0s, so we have scope to store more
        // complicated values in the future. It is 2 rather than 1 to make
        // it easier to distinguish from the NormalNaN by a human when debugging.
        // IsStaleNaN compares a value's bit pattern against this constant.
        StaleNaN uint64 = 0x7ff0000000000002
)

// IsStaleNaN reports whether v is the stale marker, i.e. whether the bit
// pattern of v is exactly StaleNaN. Other NaN values are not stale markers.
func IsStaleNaN(v float64) bool {
        bits := math.Float64bits(v)
        return bits == StaleNaN
}

// Code generated by protoc-gen-gogo. DO NOT EDIT.
// source: io/prometheus/client/metrics.proto

package io_prometheus_client

import (
        encoding_binary "encoding/binary"
        fmt "fmt"
        io "io"
        math "math"
        math_bits "math/bits"

        _ "github.com/gogo/protobuf/gogoproto"
        proto "github.com/gogo/protobuf/proto"
        types "github.com/gogo/protobuf/types"
)

// Reference imports to suppress errors if they are not otherwise used.
// Each blank assignment keeps the corresponding import in use.
var _ = proto.Marshal
var _ = fmt.Errorf
var _ = math.Inf

// This is a compile-time assertion to ensure that this generated file
// is compatible with the proto package it is being compiled against.
// A compilation error at this line likely means your copy of the
// proto package needs to be updated.
const _ = proto.GoGoProtoPackageIsVersion3 // please upgrade the proto package

// MetricType enumerates the kinds of metrics a metric family can hold. Each
// value determines which field of a Metric must be used (see the constants).
type MetricType int32

const (
        // COUNTER must use the Metric field "counter".
        MetricType_COUNTER MetricType = 0
        // GAUGE must use the Metric field "gauge".
        MetricType_GAUGE MetricType = 1
        // SUMMARY must use the Metric field "summary".
        MetricType_SUMMARY MetricType = 2
        // UNTYPED must use the Metric field "untyped".
        MetricType_UNTYPED MetricType = 3
        // HISTOGRAM must use the Metric field "histogram".
        MetricType_HISTOGRAM MetricType = 4
        // GAUGE_HISTOGRAM must use the Metric field "histogram".
        MetricType_GAUGE_HISTOGRAM MetricType = 5
)

// MetricType_name maps the numeric enum values to their names.
var MetricType_name = map[int32]string{
        0: "COUNTER",
        1: "GAUGE",
        2: "SUMMARY",
        3: "UNTYPED",
        4: "HISTOGRAM",
        5: "GAUGE_HISTOGRAM",
}

// MetricType_value is the inverse of MetricType_name: it maps the enum names
// to their numeric values.
var MetricType_value = map[string]int32{
        "COUNTER":         0,
        "GAUGE":           1,
        "SUMMARY":         2,
        "UNTYPED":         3,
        "HISTOGRAM":       4,
        "GAUGE_HISTOGRAM": 5,
}

// String returns the result of looking up x in MetricType_name via
// proto.EnumName.
func (x MetricType) String() string {
        return proto.EnumName(MetricType_name, int32(x))
}

// EnumDescriptor returns the file descriptor variable of this generated file
// together with the index path []int{0} of the enum.
func (MetricType) EnumDescriptor() ([]byte, []int) {
        return fileDescriptor_d1e5ddb18987a258, []int{0}
}

// LabelPair is a single label: a name (protobuf field 1) and its value
// (protobuf field 2). The XXX_ fields are bookkeeping fields of the generated
// code and are excluded from JSON.
type LabelPair struct {
        Name                 string   `protobuf:"bytes,1,opt,name=name,proto3" json:"name,omitempty"`
        Value                string   `protobuf:"bytes,2,opt,name=value,proto3" json:"value,omitempty"`
        XXX_NoUnkeyedLiteral struct{} `json:"-"`
        XXX_unrecognized     []byte   `json:"-"`
        XXX_sizecache        int32    `json:"-"`
}

// Reset overwrites m with the zero LabelPair, String formats m via
// proto.CompactTextString, and ProtoMessage is an empty marker method.
func (m *LabelPair) Reset()         { *m = LabelPair{} }
func (m *LabelPair) String() string { return proto.CompactTextString(m) }
func (*LabelPair) ProtoMessage()    {}

// Descriptor returns the file descriptor variable of this generated file
// together with the index path []int{0} of the message.
func (*LabelPair) Descriptor() ([]byte, []int) {
        return fileDescriptor_d1e5ddb18987a258, []int{0}
}

// XXX_Unmarshal decodes b into m by delegating to m.Unmarshal.
func (m *LabelPair) XXX_Unmarshal(b []byte) error {
        return m.Unmarshal(b)
}

// XXX_Marshal encodes m. With deterministic set, the work is delegated to
// xxx_messageInfo_LabelPair. Otherwise b is extended to its full capacity,
// filled by MarshalToSizedBuffer, and the first n bytes are returned.
func (m *LabelPair) XXX_Marshal(b []byte, deterministic bool) ([]byte, error) {
        if deterministic {
                return xxx_messageInfo_LabelPair.Marshal(b, m, deterministic)
        } else {
                b = b[:cap(b)]
                n, err := m.MarshalToSizedBuffer(b)
                if err != nil {
                        return nil, err
                }
                return b[:n], nil
        }
}

// XXX_Merge merges src into m via xxx_messageInfo_LabelPair.
func (m *LabelPair) XXX_Merge(src proto.Message) {
        xxx_messageInfo_LabelPair.Merge(m, src)
}

// XXX_Size returns m.Size().
func (m *LabelPair) XXX_Size() int {
        return m.Size()
}

// XXX_DiscardUnknown delegates to xxx_messageInfo_LabelPair.DiscardUnknown.
func (m *LabelPair) XXX_DiscardUnknown() {
        xxx_messageInfo_LabelPair.DiscardUnknown(m)
}

// xxx_messageInfo_LabelPair backs the XXX_ helper methods of LabelPair.
var xxx_messageInfo_LabelPair proto.InternalMessageInfo

// GetName returns the Name field, or "" if m is nil.
func (m *LabelPair) GetName() string {
        if m != nil {
                return m.Name
        }
        return ""
}

// GetValue returns the Value field, or "" if m is nil.
func (m *LabelPair) GetValue() string {
        if m != nil {
                return m.Value
        }
        return ""
}

// Gauge is the message holding a gauge sample: a single float64 value
// (protobuf field 1). The XXX_ fields are bookkeeping fields of the generated
// code and are excluded from JSON.
type Gauge struct {
        Value                float64  `protobuf:"fixed64,1,opt,name=value,proto3" json:"value,omitempty"`
        XXX_NoUnkeyedLiteral struct{} `json:"-"`
        XXX_unrecognized     []byte   `json:"-"`
        XXX_sizecache        int32    `json:"-"`
}

// Reset overwrites m with the zero Gauge, String formats m via
// proto.CompactTextString, and ProtoMessage is an empty marker method.
func (m *Gauge) Reset()         { *m = Gauge{} }
func (m *Gauge) String() string { return proto.CompactTextString(m) }
func (*Gauge) ProtoMessage()    {}

// Descriptor returns the file descriptor variable of this generated file
// together with the index path []int{1} of the message.
func (*Gauge) Descriptor() ([]byte, []int) {
        return fileDescriptor_d1e5ddb18987a258, []int{1}
}

// XXX_Unmarshal decodes b into m by delegating to m.Unmarshal.
func (m *Gauge) XXX_Unmarshal(b []byte) error {
        return m.Unmarshal(b)
}

// XXX_Marshal encodes m. With deterministic set, the work is delegated to
// xxx_messageInfo_Gauge. Otherwise b is extended to its full capacity, filled
// by MarshalToSizedBuffer, and the first n bytes are returned.
func (m *Gauge) XXX_Marshal(b []byte, deterministic bool) ([]byte, error) {
        if deterministic {
                return xxx_messageInfo_Gauge.Marshal(b, m, deterministic)
        } else {
                b = b[:cap(b)]
                n, err := m.MarshalToSizedBuffer(b)
                if err != nil {
                        return nil, err
                }
                return b[:n], nil
        }
}

// XXX_Merge merges src into m via xxx_messageInfo_Gauge.
func (m *Gauge) XXX_Merge(src proto.Message) {
        xxx_messageInfo_Gauge.Merge(m, src)
}

// XXX_Size returns m.Size().
func (m *Gauge) XXX_Size() int {
        return m.Size()
}

// XXX_DiscardUnknown delegates to xxx_messageInfo_Gauge.DiscardUnknown.
func (m *Gauge) XXX_DiscardUnknown() {
        xxx_messageInfo_Gauge.DiscardUnknown(m)
}

// xxx_messageInfo_Gauge backs the XXX_ helper methods of Gauge.
var xxx_messageInfo_Gauge proto.InternalMessageInfo

// GetValue returns the Value field, or 0 if m is nil.
func (m *Gauge) GetValue() float64 {
        if m != nil {
                return m.Value
        }
        return 0
}

// Counter is the message holding a counter sample: its float64 value
// (protobuf field 1), an optional exemplar (field 2) and an optional created
// timestamp (field 3). The XXX_ fields are bookkeeping fields of the generated
// code and are excluded from JSON.
type Counter struct {
        Value                float64          `protobuf:"fixed64,1,opt,name=value,proto3" json:"value,omitempty"`
        Exemplar             *Exemplar        `protobuf:"bytes,2,opt,name=exemplar,proto3" json:"exemplar,omitempty"`
        CreatedTimestamp     *types.Timestamp `protobuf:"bytes,3,opt,name=created_timestamp,json=createdTimestamp,proto3" json:"created_timestamp,omitempty"`
        XXX_NoUnkeyedLiteral struct{}         `json:"-"`
        XXX_unrecognized     []byte           `json:"-"`
        XXX_sizecache        int32            `json:"-"`
}

// Reset overwrites m with the zero Counter, String formats m via
// proto.CompactTextString, and ProtoMessage is an empty marker method.
func (m *Counter) Reset()         { *m = Counter{} }
func (m *Counter) String() string { return proto.CompactTextString(m) }
func (*Counter) ProtoMessage()    {}

// Descriptor returns the file descriptor variable of this generated file
// together with the index path []int{2} of the message.
func (*Counter) Descriptor() ([]byte, []int) {
        return fileDescriptor_d1e5ddb18987a258, []int{2}
}

// XXX_Unmarshal decodes b into m by delegating to m.Unmarshal.
func (m *Counter) XXX_Unmarshal(b []byte) error {
        return m.Unmarshal(b)
}

// XXX_Marshal encodes m. With deterministic set, the work is delegated to
// xxx_messageInfo_Counter. Otherwise b is extended to its full capacity,
// filled by MarshalToSizedBuffer, and the first n bytes are returned.
func (m *Counter) XXX_Marshal(b []byte, deterministic bool) ([]byte, error) {
        if deterministic {
                return xxx_messageInfo_Counter.Marshal(b, m, deterministic)
        } else {
                b = b[:cap(b)]
                n, err := m.MarshalToSizedBuffer(b)
                if err != nil {
                        return nil, err
                }
                return b[:n], nil
        }
}

// XXX_Merge merges src into m via xxx_messageInfo_Counter.
func (m *Counter) XXX_Merge(src proto.Message) {
        xxx_messageInfo_Counter.Merge(m, src)
}

// XXX_Size returns m.Size().
func (m *Counter) XXX_Size() int {
        return m.Size()
}

// XXX_DiscardUnknown delegates to xxx_messageInfo_Counter.DiscardUnknown.
func (m *Counter) XXX_DiscardUnknown() {
        xxx_messageInfo_Counter.DiscardUnknown(m)
}

// xxx_messageInfo_Counter backs the XXX_ helper methods of Counter.
var xxx_messageInfo_Counter proto.InternalMessageInfo

// GetValue returns the Value field, or 0 if m is nil.
func (m *Counter) GetValue() float64 {
        if m != nil {
                return m.Value
        }
        return 0
}

// GetExemplar returns the Exemplar field, or nil if m is nil.
func (m *Counter) GetExemplar() *Exemplar {
        if m != nil {
                return m.Exemplar
        }
        return nil
}

// GetCreatedTimestamp returns the CreatedTimestamp field, or nil if m is nil.
func (m *Counter) GetCreatedTimestamp() *types.Timestamp {
        if m != nil {
                return m.CreatedTimestamp
        }
        return nil
}

// Quantile is one quantile entry of a Summary: the quantile itself (protobuf
// field 1) and the value associated with it (field 2). The XXX_ fields are
// bookkeeping fields of the generated code and are excluded from JSON.
type Quantile struct {
        Quantile             float64  `protobuf:"fixed64,1,opt,name=quantile,proto3" json:"quantile,omitempty"`
        Value                float64  `protobuf:"fixed64,2,opt,name=value,proto3" json:"value,omitempty"`
        XXX_NoUnkeyedLiteral struct{} `json:"-"`
        XXX_unrecognized     []byte   `json:"-"`
        XXX_sizecache        int32    `json:"-"`
}

// Reset overwrites m with the zero Quantile, String formats m via
// proto.CompactTextString, and ProtoMessage is an empty marker method.
func (m *Quantile) Reset()         { *m = Quantile{} }
func (m *Quantile) String() string { return proto.CompactTextString(m) }
func (*Quantile) ProtoMessage()    {}

// Descriptor returns the file descriptor variable of this generated file
// together with the index path []int{3} of the message.
func (*Quantile) Descriptor() ([]byte, []int) {
        return fileDescriptor_d1e5ddb18987a258, []int{3}
}

// XXX_Unmarshal decodes b into m by delegating to m.Unmarshal.
func (m *Quantile) XXX_Unmarshal(b []byte) error {
        return m.Unmarshal(b)
}

// XXX_Marshal encodes m. With deterministic set, the work is delegated to
// xxx_messageInfo_Quantile. Otherwise b is extended to its full capacity,
// filled by MarshalToSizedBuffer, and the first n bytes are returned.
func (m *Quantile) XXX_Marshal(b []byte, deterministic bool) ([]byte, error) {
        if deterministic {
                return xxx_messageInfo_Quantile.Marshal(b, m, deterministic)
        } else {
                b = b[:cap(b)]
                n, err := m.MarshalToSizedBuffer(b)
                if err != nil {
                        return nil, err
                }
                return b[:n], nil
        }
}

// XXX_Merge merges src into m via xxx_messageInfo_Quantile.
func (m *Quantile) XXX_Merge(src proto.Message) {
        xxx_messageInfo_Quantile.Merge(m, src)
}

// XXX_Size returns m.Size().
func (m *Quantile) XXX_Size() int {
        return m.Size()
}

// XXX_DiscardUnknown delegates to xxx_messageInfo_Quantile.DiscardUnknown.
func (m *Quantile) XXX_DiscardUnknown() {
        xxx_messageInfo_Quantile.DiscardUnknown(m)
}

// xxx_messageInfo_Quantile backs the XXX_ helper methods of Quantile.
var xxx_messageInfo_Quantile proto.InternalMessageInfo

// GetQuantile returns the Quantile field, or 0 if m is nil.
func (m *Quantile) GetQuantile() float64 {
        if m != nil {
                return m.Quantile
        }
        return 0
}

// GetValue returns the Value field, or 0 if m is nil.
func (m *Quantile) GetValue() float64 {
        if m != nil {
                return m.Value
        }
        return 0
}

// Summary is the message holding a summary sample: the sample count (protobuf
// field 1), the sample sum (field 2), the list of quantiles (field 3, stored
// by value) and an optional created timestamp (field 4). The XXX_ fields are
// bookkeeping fields of the generated code and are excluded from JSON.
type Summary struct {
        SampleCount          uint64           `protobuf:"varint,1,opt,name=sample_count,json=sampleCount,proto3" json:"sample_count,omitempty"`
        SampleSum            float64          `protobuf:"fixed64,2,opt,name=sample_sum,json=sampleSum,proto3" json:"sample_sum,omitempty"`
        Quantile             []Quantile       `protobuf:"bytes,3,rep,name=quantile,proto3" json:"quantile"`
        CreatedTimestamp     *types.Timestamp `protobuf:"bytes,4,opt,name=created_timestamp,json=createdTimestamp,proto3" json:"created_timestamp,omitempty"`
        XXX_NoUnkeyedLiteral struct{}         `json:"-"`
        XXX_unrecognized     []byte           `json:"-"`
        XXX_sizecache        int32            `json:"-"`
}

// Reset overwrites m with the zero Summary, String formats m via
// proto.CompactTextString, and ProtoMessage is an empty marker method.
func (m *Summary) Reset()         { *m = Summary{} }
func (m *Summary) String() string { return proto.CompactTextString(m) }
func (*Summary) ProtoMessage()    {}

// Descriptor returns the file descriptor variable of this generated file
// together with the index path []int{4} of the message.
func (*Summary) Descriptor() ([]byte, []int) {
        return fileDescriptor_d1e5ddb18987a258, []int{4}
}

// XXX_Unmarshal decodes b into m by delegating to m.Unmarshal.
func (m *Summary) XXX_Unmarshal(b []byte) error {
        return m.Unmarshal(b)
}

// XXX_Marshal encodes m. With deterministic set, the work is delegated to
// xxx_messageInfo_Summary. Otherwise b is extended to its full capacity,
// filled by MarshalToSizedBuffer, and the first n bytes are returned.
func (m *Summary) XXX_Marshal(b []byte, deterministic bool) ([]byte, error) {
        if deterministic {
                return xxx_messageInfo_Summary.Marshal(b, m, deterministic)
        } else {
                b = b[:cap(b)]
                n, err := m.MarshalToSizedBuffer(b)
                if err != nil {
                        return nil, err
                }
                return b[:n], nil
        }
}

// XXX_Merge merges src into m via xxx_messageInfo_Summary.
func (m *Summary) XXX_Merge(src proto.Message) {
        xxx_messageInfo_Summary.Merge(m, src)
}

// XXX_Size returns m.Size().
func (m *Summary) XXX_Size() int {
        return m.Size()
}

// XXX_DiscardUnknown delegates to xxx_messageInfo_Summary.DiscardUnknown.
func (m *Summary) XXX_DiscardUnknown() {
        xxx_messageInfo_Summary.DiscardUnknown(m)
}

// xxx_messageInfo_Summary backs the XXX_ helper methods of Summary.
var xxx_messageInfo_Summary proto.InternalMessageInfo

// GetSampleCount returns the SampleCount field, or 0 if m is nil.
func (m *Summary) GetSampleCount() uint64 {
        if m != nil {
                return m.SampleCount
        }
        return 0
}

// GetSampleSum returns the SampleSum field, or 0 if m is nil.
func (m *Summary) GetSampleSum() float64 {
        if m != nil {
                return m.SampleSum
        }
        return 0
}

// GetQuantile returns the Quantile slice, or nil if m is nil.
func (m *Summary) GetQuantile() []Quantile {
        if m != nil {
                return m.Quantile
        }
        return nil
}

// GetCreatedTimestamp returns the CreatedTimestamp field, or nil if m is nil.
func (m *Summary) GetCreatedTimestamp() *types.Timestamp {
        if m != nil {
                return m.CreatedTimestamp
        }
        return nil
}

// Untyped is the message holding an untyped sample: a single float64 value
// (protobuf field 1). The XXX_ fields are bookkeeping fields of the generated
// code and are excluded from JSON.
type Untyped struct {
        Value                float64  `protobuf:"fixed64,1,opt,name=value,proto3" json:"value,omitempty"`
        XXX_NoUnkeyedLiteral struct{} `json:"-"`
        XXX_unrecognized     []byte   `json:"-"`
        XXX_sizecache        int32    `json:"-"`
}

// Reset overwrites m with the zero Untyped, String formats m via
// proto.CompactTextString, and ProtoMessage is an empty marker method.
func (m *Untyped) Reset()         { *m = Untyped{} }
func (m *Untyped) String() string { return proto.CompactTextString(m) }
func (*Untyped) ProtoMessage()    {}

// Descriptor returns the file descriptor variable of this generated file
// together with the index path []int{5} of the message.
func (*Untyped) Descriptor() ([]byte, []int) {
        return fileDescriptor_d1e5ddb18987a258, []int{5}
}

// XXX_Unmarshal decodes b into m by delegating to m.Unmarshal.
func (m *Untyped) XXX_Unmarshal(b []byte) error {
        return m.Unmarshal(b)
}

// XXX_Marshal encodes m. With deterministic set, the work is delegated to
// xxx_messageInfo_Untyped. Otherwise b is extended to its full capacity,
// filled by MarshalToSizedBuffer, and the first n bytes are returned.
func (m *Untyped) XXX_Marshal(b []byte, deterministic bool) ([]byte, error) {
        if deterministic {
                return xxx_messageInfo_Untyped.Marshal(b, m, deterministic)
        } else {
                b = b[:cap(b)]
                n, err := m.MarshalToSizedBuffer(b)
                if err != nil {
                        return nil, err
                }
                return b[:n], nil
        }
}

// XXX_Merge merges src into m via xxx_messageInfo_Untyped.
func (m *Untyped) XXX_Merge(src proto.Message) {
        xxx_messageInfo_Untyped.Merge(m, src)
}

// XXX_Size returns m.Size().
func (m *Untyped) XXX_Size() int {
        return m.Size()
}

// XXX_DiscardUnknown delegates to xxx_messageInfo_Untyped.DiscardUnknown.
func (m *Untyped) XXX_DiscardUnknown() {
        xxx_messageInfo_Untyped.DiscardUnknown(m)
}

// xxx_messageInfo_Untyped backs the XXX_ helper methods of Untyped.
var xxx_messageInfo_Untyped proto.InternalMessageInfo

// GetValue returns the Value field, or 0 if m is nil.
func (m *Untyped) GetValue() float64 {
        if m != nil {
                return m.Value
        }
        return 0
}

// Histogram is the Go form of the protobuf message registered as
// "io.prometheus.client.Histogram". It carries the sample count and sum,
// the classic buckets (Bucket), and the native-histogram data (Schema, the
// zero bucket, and negative/positive spans with their delta or float counts)
// as described by the per-field comments below. The XXX_-prefixed fields are
// excluded from JSON output (json:"-").
type Histogram struct {
        SampleCount      uint64  `protobuf:"varint,1,opt,name=sample_count,json=sampleCount,proto3" json:"sample_count,omitempty"`
        SampleCountFloat float64 `protobuf:"fixed64,4,opt,name=sample_count_float,json=sampleCountFloat,proto3" json:"sample_count_float,omitempty"`
        SampleSum        float64 `protobuf:"fixed64,2,opt,name=sample_sum,json=sampleSum,proto3" json:"sample_sum,omitempty"`
        // Buckets for the classic histogram.
        Bucket           []Bucket         `protobuf:"bytes,3,rep,name=bucket,proto3" json:"bucket"`
        CreatedTimestamp *types.Timestamp `protobuf:"bytes,15,opt,name=created_timestamp,json=createdTimestamp,proto3" json:"created_timestamp,omitempty"`
        // schema defines the bucket schema. Currently, valid numbers are -4 <= n <= 8.
        // They are all for base-2 bucket schemas, where 1 is a bucket boundary in each case, and
        // then each power of two is divided into 2^n logarithmic buckets.
        // Or in other words, each bucket boundary is the previous boundary times 2^(2^-n).
        // In the future, more bucket schemas may be added using numbers < -4 or > 8.
        Schema         int32   `protobuf:"zigzag32,5,opt,name=schema,proto3" json:"schema,omitempty"`
        ZeroThreshold  float64 `protobuf:"fixed64,6,opt,name=zero_threshold,json=zeroThreshold,proto3" json:"zero_threshold,omitempty"`
        ZeroCount      uint64  `protobuf:"varint,7,opt,name=zero_count,json=zeroCount,proto3" json:"zero_count,omitempty"`
        ZeroCountFloat float64 `protobuf:"fixed64,8,opt,name=zero_count_float,json=zeroCountFloat,proto3" json:"zero_count_float,omitempty"`
        // Negative buckets for the native histogram.
        NegativeSpan []BucketSpan `protobuf:"bytes,9,rep,name=negative_span,json=negativeSpan,proto3" json:"negative_span"`
        // Use either "negative_delta" or "negative_count", the former for
        // regular histograms with integer counts, the latter for float
        // histograms.
        NegativeDelta []int64   `protobuf:"zigzag64,10,rep,packed,name=negative_delta,json=negativeDelta,proto3" json:"negative_delta,omitempty"`
        NegativeCount []float64 `protobuf:"fixed64,11,rep,packed,name=negative_count,json=negativeCount,proto3" json:"negative_count,omitempty"`
        // Positive buckets for the native histogram.
        // Use a no-op span (offset 0, length 0) for a native histogram without any
        // observations yet and with a zero_threshold of 0. Otherwise, it would be
        // indistinguishable from a classic histogram.
        PositiveSpan []BucketSpan `protobuf:"bytes,12,rep,name=positive_span,json=positiveSpan,proto3" json:"positive_span"`
        // Use either "positive_delta" or "positive_count", the former for
        // regular histograms with integer counts, the latter for float
        // histograms.
        PositiveDelta []int64   `protobuf:"zigzag64,13,rep,packed,name=positive_delta,json=positiveDelta,proto3" json:"positive_delta,omitempty"`
        PositiveCount []float64 `protobuf:"fixed64,14,rep,packed,name=positive_count,json=positiveCount,proto3" json:"positive_count,omitempty"`
        // Only used for native histograms. These exemplars MUST have a timestamp.
        Exemplars            []*Exemplar `protobuf:"bytes,16,rep,name=exemplars,proto3" json:"exemplars,omitempty"`
        XXX_NoUnkeyedLiteral struct{}    `json:"-"`
        XXX_unrecognized     []byte      `json:"-"`
        XXX_sizecache        int32       `json:"-"`
}

// Reset replaces *m with the zero value of Histogram.
func (m *Histogram) Reset() {
        *m = Histogram{}
}

// String formats m using proto.CompactTextString.
func (m *Histogram) String() string {
        return proto.CompactTextString(m)
}

// ProtoMessage is an empty marker method; presumably it tags Histogram as a
// protobuf message for the proto runtime.
func (*Histogram) ProtoMessage() {}

// Descriptor returns the package's gzipped file descriptor together with the
// index path []int{6} for this message.
func (*Histogram) Descriptor() ([]byte, []int) {
        return fileDescriptor_d1e5ddb18987a258, []int{6}
}

// XXX_Unmarshal decodes b into m by delegating to m.Unmarshal.
func (m *Histogram) XXX_Unmarshal(b []byte) error {
        return m.Unmarshal(b)
}

// XXX_Marshal encodes m. Deterministic requests are delegated to
// xxx_messageInfo_Histogram; otherwise b is extended to its full capacity and
// handed to MarshalToSizedBuffer.
func (m *Histogram) XXX_Marshal(b []byte, deterministic bool) ([]byte, error) {
        if !deterministic {
                buf := b[:cap(b)]
                written, err := m.MarshalToSizedBuffer(buf)
                if err != nil {
                        return nil, err
                }
                return buf[:written], nil
        }
        return xxx_messageInfo_Histogram.Marshal(b, m, deterministic)
}

// XXX_Merge merges src into m through xxx_messageInfo_Histogram.
func (m *Histogram) XXX_Merge(src proto.Message) {
        xxx_messageInfo_Histogram.Merge(m, src)
}

// XXX_Size reports the value of m.Size().
func (m *Histogram) XXX_Size() int {
        return m.Size()
}

// XXX_DiscardUnknown delegates to xxx_messageInfo_Histogram.DiscardUnknown.
func (m *Histogram) XXX_DiscardUnknown() {
        xxx_messageInfo_Histogram.DiscardUnknown(m)
}

// xxx_messageInfo_Histogram backs the XXX_* methods of Histogram.
var xxx_messageInfo_Histogram proto.InternalMessageInfo

// GetSampleCount returns the SampleCount field, or 0 when m is nil.
func (m *Histogram) GetSampleCount() uint64 {
        if m == nil {
                return 0
        }
        return m.SampleCount
}

// GetSampleCountFloat returns the SampleCountFloat field, or 0 when m is nil.
func (m *Histogram) GetSampleCountFloat() float64 {
        if m == nil {
                return 0
        }
        return m.SampleCountFloat
}

// GetSampleSum returns the SampleSum field, or 0 when m is nil.
func (m *Histogram) GetSampleSum() float64 {
        if m == nil {
                return 0
        }
        return m.SampleSum
}

// GetBucket returns the Bucket slice, or nil when m is nil.
func (m *Histogram) GetBucket() []Bucket {
        if m == nil {
                return nil
        }
        return m.Bucket
}

// GetCreatedTimestamp returns the CreatedTimestamp pointer, or nil when m is nil.
func (m *Histogram) GetCreatedTimestamp() *types.Timestamp {
        if m == nil {
                return nil
        }
        return m.CreatedTimestamp
}

// GetSchema returns the Schema field, or 0 when m is nil.
func (m *Histogram) GetSchema() int32 {
        if m == nil {
                return 0
        }
        return m.Schema
}

// GetZeroThreshold returns the ZeroThreshold field, or 0 when m is nil.
func (m *Histogram) GetZeroThreshold() float64 {
        if m == nil {
                return 0
        }
        return m.ZeroThreshold
}

// GetZeroCount returns the ZeroCount field, or 0 when m is nil.
func (m *Histogram) GetZeroCount() uint64 {
        if m == nil {
                return 0
        }
        return m.ZeroCount
}

// GetZeroCountFloat returns the ZeroCountFloat field, or 0 when m is nil.
func (m *Histogram) GetZeroCountFloat() float64 {
        if m == nil {
                return 0
        }
        return m.ZeroCountFloat
}

// GetNegativeSpan returns the NegativeSpan slice, or nil when m is nil.
func (m *Histogram) GetNegativeSpan() []BucketSpan {
        if m == nil {
                return nil
        }
        return m.NegativeSpan
}

// GetNegativeDelta returns the NegativeDelta slice, or nil when m is nil.
func (m *Histogram) GetNegativeDelta() []int64 {
        if m == nil {
                return nil
        }
        return m.NegativeDelta
}

// GetNegativeCount returns the NegativeCount slice, or nil when m is nil.
func (m *Histogram) GetNegativeCount() []float64 {
        if m == nil {
                return nil
        }
        return m.NegativeCount
}

// GetPositiveSpan returns the PositiveSpan slice, or nil when m is nil.
func (m *Histogram) GetPositiveSpan() []BucketSpan {
        if m == nil {
                return nil
        }
        return m.PositiveSpan
}

// GetPositiveDelta returns the PositiveDelta slice, or nil when m is nil.
func (m *Histogram) GetPositiveDelta() []int64 {
        if m == nil {
                return nil
        }
        return m.PositiveDelta
}

// GetPositiveCount returns the PositiveCount slice, or nil when m is nil.
func (m *Histogram) GetPositiveCount() []float64 {
        if m == nil {
                return nil
        }
        return m.PositiveCount
}

// GetExemplars returns the Exemplars slice, or nil when m is nil.
func (m *Histogram) GetExemplars() []*Exemplar {
        if m == nil {
                return nil
        }
        return m.Exemplars
}

// Bucket is the Go form of the protobuf message registered as
// "io.prometheus.client.Bucket". It holds a cumulative count in integer and
// float form, an upper bound, and an Exemplar pointer. The XXX_-prefixed
// fields are excluded from JSON output (json:"-").
type Bucket struct {
        CumulativeCount      uint64    `protobuf:"varint,1,opt,name=cumulative_count,json=cumulativeCount,proto3" json:"cumulative_count,omitempty"`
        CumulativeCountFloat float64   `protobuf:"fixed64,4,opt,name=cumulative_count_float,json=cumulativeCountFloat,proto3" json:"cumulative_count_float,omitempty"`
        UpperBound           float64   `protobuf:"fixed64,2,opt,name=upper_bound,json=upperBound,proto3" json:"upper_bound,omitempty"`
        Exemplar             *Exemplar `protobuf:"bytes,3,opt,name=exemplar,proto3" json:"exemplar,omitempty"`
        XXX_NoUnkeyedLiteral struct{}  `json:"-"`
        XXX_unrecognized     []byte    `json:"-"`
        XXX_sizecache        int32     `json:"-"`
}

// Reset replaces *m with the zero value of Bucket.
func (m *Bucket) Reset() {
        *m = Bucket{}
}

// String formats m using proto.CompactTextString.
func (m *Bucket) String() string {
        return proto.CompactTextString(m)
}

// ProtoMessage is an empty marker method; presumably it tags Bucket as a
// protobuf message for the proto runtime.
func (*Bucket) ProtoMessage() {}

// Descriptor returns the package's gzipped file descriptor together with the
// index path []int{7} for this message.
func (*Bucket) Descriptor() ([]byte, []int) {
        return fileDescriptor_d1e5ddb18987a258, []int{7}
}

// XXX_Unmarshal decodes b into m by delegating to m.Unmarshal.
func (m *Bucket) XXX_Unmarshal(b []byte) error {
        return m.Unmarshal(b)
}

// XXX_Marshal encodes m. Deterministic requests are delegated to
// xxx_messageInfo_Bucket; otherwise b is extended to its full capacity and
// handed to MarshalToSizedBuffer.
func (m *Bucket) XXX_Marshal(b []byte, deterministic bool) ([]byte, error) {
        if !deterministic {
                buf := b[:cap(b)]
                written, err := m.MarshalToSizedBuffer(buf)
                if err != nil {
                        return nil, err
                }
                return buf[:written], nil
        }
        return xxx_messageInfo_Bucket.Marshal(b, m, deterministic)
}

// XXX_Merge merges src into m through xxx_messageInfo_Bucket.
func (m *Bucket) XXX_Merge(src proto.Message) {
        xxx_messageInfo_Bucket.Merge(m, src)
}

// XXX_Size reports the value of m.Size().
func (m *Bucket) XXX_Size() int {
        return m.Size()
}

// XXX_DiscardUnknown delegates to xxx_messageInfo_Bucket.DiscardUnknown.
func (m *Bucket) XXX_DiscardUnknown() {
        xxx_messageInfo_Bucket.DiscardUnknown(m)
}

// xxx_messageInfo_Bucket backs the XXX_* methods of Bucket.
var xxx_messageInfo_Bucket proto.InternalMessageInfo

// GetCumulativeCount returns the CumulativeCount field, or 0 when m is nil.
func (m *Bucket) GetCumulativeCount() uint64 {
        if m == nil {
                return 0
        }
        return m.CumulativeCount
}

// GetCumulativeCountFloat returns the CumulativeCountFloat field, or 0 when
// m is nil.
func (m *Bucket) GetCumulativeCountFloat() float64 {
        if m == nil {
                return 0
        }
        return m.CumulativeCountFloat
}

// GetUpperBound returns the UpperBound field, or 0 when m is nil.
func (m *Bucket) GetUpperBound() float64 {
        if m == nil {
                return 0
        }
        return m.UpperBound
}

// GetExemplar returns the Exemplar pointer, or nil when m is nil.
func (m *Bucket) GetExemplar() *Exemplar {
        if m == nil {
                return nil
        }
        return m.Exemplar
}

// A BucketSpan defines a number of consecutive buckets in a native
// histogram with their offset. Logically, it would be more
// straightforward to include the bucket counts in the Span. However,
// the protobuf representation is more compact in the way the data is
// structured here (with all the buckets in a single array separate
// from the Spans).
//
// It is registered as "io.prometheus.client.BucketSpan". Offset is encoded
// as a zigzag32 value (field 1) and Length as a varint (field 2); the
// XXX_-prefixed fields are excluded from JSON output (json:"-").
type BucketSpan struct {
        Offset               int32    `protobuf:"zigzag32,1,opt,name=offset,proto3" json:"offset,omitempty"`
        Length               uint32   `protobuf:"varint,2,opt,name=length,proto3" json:"length,omitempty"`
        XXX_NoUnkeyedLiteral struct{} `json:"-"`
        XXX_unrecognized     []byte   `json:"-"`
        XXX_sizecache        int32    `json:"-"`
}

// Reset replaces *m with the zero value of BucketSpan.
func (m *BucketSpan) Reset() {
        *m = BucketSpan{}
}

// String formats m using proto.CompactTextString.
func (m *BucketSpan) String() string {
        return proto.CompactTextString(m)
}

// ProtoMessage is an empty marker method; presumably it tags BucketSpan as a
// protobuf message for the proto runtime.
func (*BucketSpan) ProtoMessage() {}

// Descriptor returns the package's gzipped file descriptor together with the
// index path []int{8} for this message.
func (*BucketSpan) Descriptor() ([]byte, []int) {
        return fileDescriptor_d1e5ddb18987a258, []int{8}
}

// XXX_Unmarshal decodes b into m by delegating to m.Unmarshal.
func (m *BucketSpan) XXX_Unmarshal(b []byte) error {
        return m.Unmarshal(b)
}

// XXX_Marshal encodes m. Deterministic requests are delegated to
// xxx_messageInfo_BucketSpan; otherwise b is extended to its full capacity
// and handed to MarshalToSizedBuffer.
func (m *BucketSpan) XXX_Marshal(b []byte, deterministic bool) ([]byte, error) {
        if !deterministic {
                buf := b[:cap(b)]
                written, err := m.MarshalToSizedBuffer(buf)
                if err != nil {
                        return nil, err
                }
                return buf[:written], nil
        }
        return xxx_messageInfo_BucketSpan.Marshal(b, m, deterministic)
}

// XXX_Merge merges src into m through xxx_messageInfo_BucketSpan.
func (m *BucketSpan) XXX_Merge(src proto.Message) {
        xxx_messageInfo_BucketSpan.Merge(m, src)
}

// XXX_Size reports the value of m.Size().
func (m *BucketSpan) XXX_Size() int {
        return m.Size()
}

// XXX_DiscardUnknown delegates to xxx_messageInfo_BucketSpan.DiscardUnknown.
func (m *BucketSpan) XXX_DiscardUnknown() {
        xxx_messageInfo_BucketSpan.DiscardUnknown(m)
}

// xxx_messageInfo_BucketSpan backs the XXX_* methods of BucketSpan.
var xxx_messageInfo_BucketSpan proto.InternalMessageInfo

// GetOffset returns the Offset field, or 0 when m is nil.
func (m *BucketSpan) GetOffset() int32 {
        if m == nil {
                return 0
        }
        return m.Offset
}

// GetLength returns the Length field, or 0 when m is nil.
func (m *BucketSpan) GetLength() uint32 {
        if m == nil {
                return 0
        }
        return m.Length
}

// Exemplar is the Go form of the protobuf message registered as
// "io.prometheus.client.Exemplar". It carries a list of label pairs, a
// float64 value and a Timestamp pointer. The XXX_-prefixed fields are
// excluded from JSON output (json:"-").
type Exemplar struct {
        Label                []LabelPair      `protobuf:"bytes,1,rep,name=label,proto3" json:"label"`
        Value                float64          `protobuf:"fixed64,2,opt,name=value,proto3" json:"value,omitempty"`
        Timestamp            *types.Timestamp `protobuf:"bytes,3,opt,name=timestamp,proto3" json:"timestamp,omitempty"`
        XXX_NoUnkeyedLiteral struct{}         `json:"-"`
        XXX_unrecognized     []byte           `json:"-"`
        XXX_sizecache        int32            `json:"-"`
}

// Reset replaces *m with the zero value of Exemplar.
func (m *Exemplar) Reset() {
        *m = Exemplar{}
}

// String formats m using proto.CompactTextString.
func (m *Exemplar) String() string {
        return proto.CompactTextString(m)
}

// ProtoMessage is an empty marker method; presumably it tags Exemplar as a
// protobuf message for the proto runtime.
func (*Exemplar) ProtoMessage() {}

// Descriptor returns the package's gzipped file descriptor together with the
// index path []int{9} for this message.
func (*Exemplar) Descriptor() ([]byte, []int) {
        return fileDescriptor_d1e5ddb18987a258, []int{9}
}

// XXX_Unmarshal decodes b into m by delegating to m.Unmarshal.
func (m *Exemplar) XXX_Unmarshal(b []byte) error {
        return m.Unmarshal(b)
}

// XXX_Marshal encodes m. Deterministic requests are delegated to
// xxx_messageInfo_Exemplar; otherwise b is extended to its full capacity and
// handed to MarshalToSizedBuffer.
func (m *Exemplar) XXX_Marshal(b []byte, deterministic bool) ([]byte, error) {
        if !deterministic {
                buf := b[:cap(b)]
                written, err := m.MarshalToSizedBuffer(buf)
                if err != nil {
                        return nil, err
                }
                return buf[:written], nil
        }
        return xxx_messageInfo_Exemplar.Marshal(b, m, deterministic)
}

// XXX_Merge merges src into m through xxx_messageInfo_Exemplar.
func (m *Exemplar) XXX_Merge(src proto.Message) {
        xxx_messageInfo_Exemplar.Merge(m, src)
}

// XXX_Size reports the value of m.Size().
func (m *Exemplar) XXX_Size() int {
        return m.Size()
}

// XXX_DiscardUnknown delegates to xxx_messageInfo_Exemplar.DiscardUnknown.
func (m *Exemplar) XXX_DiscardUnknown() {
        xxx_messageInfo_Exemplar.DiscardUnknown(m)
}

// xxx_messageInfo_Exemplar backs the XXX_* methods of Exemplar.
var xxx_messageInfo_Exemplar proto.InternalMessageInfo

// GetLabel returns the Label slice, or nil when m is nil.
func (m *Exemplar) GetLabel() []LabelPair {
        if m == nil {
                return nil
        }
        return m.Label
}

// GetValue returns the Value field, or 0 when m is nil.
func (m *Exemplar) GetValue() float64 {
        if m == nil {
                return 0
        }
        return m.Value
}

// GetTimestamp returns the Timestamp pointer, or nil when m is nil.
func (m *Exemplar) GetTimestamp() *types.Timestamp {
        if m == nil {
                return nil
        }
        return m.Timestamp
}

// Metric is the Go form of the protobuf message registered as
// "io.prometheus.client.Metric". It holds a list of label pairs, one pointer
// each for Gauge, Counter, Summary, Untyped and Histogram, and TimestampMs.
// The XXX_-prefixed fields are excluded from JSON output (json:"-").
type Metric struct {
        Label                []LabelPair `protobuf:"bytes,1,rep,name=label,proto3" json:"label"`
        Gauge                *Gauge      `protobuf:"bytes,2,opt,name=gauge,proto3" json:"gauge,omitempty"`
        Counter              *Counter    `protobuf:"bytes,3,opt,name=counter,proto3" json:"counter,omitempty"`
        Summary              *Summary    `protobuf:"bytes,4,opt,name=summary,proto3" json:"summary,omitempty"`
        Untyped              *Untyped    `protobuf:"bytes,5,opt,name=untyped,proto3" json:"untyped,omitempty"`
        Histogram            *Histogram  `protobuf:"bytes,7,opt,name=histogram,proto3" json:"histogram,omitempty"`
        TimestampMs          int64       `protobuf:"varint,6,opt,name=timestamp_ms,json=timestampMs,proto3" json:"timestamp_ms,omitempty"`
        XXX_NoUnkeyedLiteral struct{}    `json:"-"`
        XXX_unrecognized     []byte      `json:"-"`
        XXX_sizecache        int32       `json:"-"`
}

// Reset replaces *m with the zero value of Metric.
func (m *Metric) Reset() {
        *m = Metric{}
}

// String formats m using proto.CompactTextString.
func (m *Metric) String() string {
        return proto.CompactTextString(m)
}

// ProtoMessage is an empty marker method; presumably it tags Metric as a
// protobuf message for the proto runtime.
func (*Metric) ProtoMessage() {}

// Descriptor returns the package's gzipped file descriptor together with the
// index path []int{10} for this message.
func (*Metric) Descriptor() ([]byte, []int) {
        return fileDescriptor_d1e5ddb18987a258, []int{10}
}

// XXX_Unmarshal decodes b into m by delegating to m.Unmarshal.
func (m *Metric) XXX_Unmarshal(b []byte) error {
        return m.Unmarshal(b)
}

// XXX_Marshal encodes m. Deterministic requests are delegated to
// xxx_messageInfo_Metric; otherwise b is extended to its full capacity and
// handed to MarshalToSizedBuffer.
func (m *Metric) XXX_Marshal(b []byte, deterministic bool) ([]byte, error) {
        if !deterministic {
                buf := b[:cap(b)]
                written, err := m.MarshalToSizedBuffer(buf)
                if err != nil {
                        return nil, err
                }
                return buf[:written], nil
        }
        return xxx_messageInfo_Metric.Marshal(b, m, deterministic)
}

// XXX_Merge merges src into m through xxx_messageInfo_Metric.
func (m *Metric) XXX_Merge(src proto.Message) {
        xxx_messageInfo_Metric.Merge(m, src)
}

// XXX_Size reports the value of m.Size().
func (m *Metric) XXX_Size() int {
        return m.Size()
}

// XXX_DiscardUnknown delegates to xxx_messageInfo_Metric.DiscardUnknown.
func (m *Metric) XXX_DiscardUnknown() {
        xxx_messageInfo_Metric.DiscardUnknown(m)
}

// xxx_messageInfo_Metric backs the XXX_* methods of Metric.
var xxx_messageInfo_Metric proto.InternalMessageInfo

// GetLabel returns the Label slice, or nil when m is nil.
func (m *Metric) GetLabel() []LabelPair {
        if m == nil {
                return nil
        }
        return m.Label
}

// GetGauge returns the Gauge pointer, or nil when m is nil.
func (m *Metric) GetGauge() *Gauge {
        if m == nil {
                return nil
        }
        return m.Gauge
}

// GetCounter returns the Counter pointer, or nil when m is nil.
func (m *Metric) GetCounter() *Counter {
        if m == nil {
                return nil
        }
        return m.Counter
}

// GetSummary returns the Summary pointer, or nil when m is nil.
func (m *Metric) GetSummary() *Summary {
        if m == nil {
                return nil
        }
        return m.Summary
}

// GetUntyped returns the Untyped pointer, or nil when m is nil.
func (m *Metric) GetUntyped() *Untyped {
        if m == nil {
                return nil
        }
        return m.Untyped
}

// GetHistogram returns the Histogram pointer, or nil when m is nil.
func (m *Metric) GetHistogram() *Histogram {
        if m == nil {
                return nil
        }
        return m.Histogram
}

// GetTimestampMs returns the TimestampMs field, or 0 when m is nil.
func (m *Metric) GetTimestampMs() int64 {
        if m == nil {
                return 0
        }
        return m.TimestampMs
}

// MetricFamily is the Go form of the protobuf message registered as
// "io.prometheus.client.MetricFamily". It groups a slice of Metric values
// under a Name, with Help text, a MetricType and a Unit. The XXX_-prefixed
// fields are excluded from JSON output (json:"-").
type MetricFamily struct {
        Name                 string     `protobuf:"bytes,1,opt,name=name,proto3" json:"name,omitempty"`
        Help                 string     `protobuf:"bytes,2,opt,name=help,proto3" json:"help,omitempty"`
        Type                 MetricType `protobuf:"varint,3,opt,name=type,proto3,enum=io.prometheus.client.MetricType" json:"type,omitempty"`
        Metric               []Metric   `protobuf:"bytes,4,rep,name=metric,proto3" json:"metric"`
        Unit                 string     `protobuf:"bytes,5,opt,name=unit,proto3" json:"unit,omitempty"`
        XXX_NoUnkeyedLiteral struct{}   `json:"-"`
        XXX_unrecognized     []byte     `json:"-"`
        XXX_sizecache        int32      `json:"-"`
}

// Reset replaces *m with the zero value of MetricFamily.
func (m *MetricFamily) Reset() {
        *m = MetricFamily{}
}

// String formats m using proto.CompactTextString.
func (m *MetricFamily) String() string {
        return proto.CompactTextString(m)
}

// ProtoMessage is an empty marker method; presumably it tags MetricFamily as
// a protobuf message for the proto runtime.
func (*MetricFamily) ProtoMessage() {}

// Descriptor returns the package's gzipped file descriptor together with the
// index path []int{11} for this message.
func (*MetricFamily) Descriptor() ([]byte, []int) {
        return fileDescriptor_d1e5ddb18987a258, []int{11}
}

// XXX_Unmarshal decodes b into m by delegating to m.Unmarshal.
func (m *MetricFamily) XXX_Unmarshal(b []byte) error {
        return m.Unmarshal(b)
}

// XXX_Marshal encodes m. Deterministic requests are delegated to
// xxx_messageInfo_MetricFamily; otherwise b is extended to its full capacity
// and handed to MarshalToSizedBuffer.
func (m *MetricFamily) XXX_Marshal(b []byte, deterministic bool) ([]byte, error) {
        if !deterministic {
                buf := b[:cap(b)]
                written, err := m.MarshalToSizedBuffer(buf)
                if err != nil {
                        return nil, err
                }
                return buf[:written], nil
        }
        return xxx_messageInfo_MetricFamily.Marshal(b, m, deterministic)
}

// XXX_Merge merges src into m through xxx_messageInfo_MetricFamily.
func (m *MetricFamily) XXX_Merge(src proto.Message) {
        xxx_messageInfo_MetricFamily.Merge(m, src)
}

// XXX_Size reports the value of m.Size().
func (m *MetricFamily) XXX_Size() int {
        return m.Size()
}

// XXX_DiscardUnknown delegates to xxx_messageInfo_MetricFamily.DiscardUnknown.
func (m *MetricFamily) XXX_DiscardUnknown() {
        xxx_messageInfo_MetricFamily.DiscardUnknown(m)
}

// xxx_messageInfo_MetricFamily backs the XXX_* methods of MetricFamily.
var xxx_messageInfo_MetricFamily proto.InternalMessageInfo

// GetName returns the Name field, or "" when m is nil.
func (m *MetricFamily) GetName() string {
        if m == nil {
                return ""
        }
        return m.Name
}

// GetHelp returns the Help field, or "" when m is nil.
func (m *MetricFamily) GetHelp() string {
        if m == nil {
                return ""
        }
        return m.Help
}

// GetType returns the Type field, or MetricType_COUNTER when m is nil.
func (m *MetricFamily) GetType() MetricType {
        if m == nil {
                return MetricType_COUNTER
        }
        return m.Type
}

// GetMetric returns the Metric slice, or nil when m is nil.
func (m *MetricFamily) GetMetric() []Metric {
        if m == nil {
                return nil
        }
        return m.Metric
}

// GetUnit returns the Unit field, or "" when m is nil.
func (m *MetricFamily) GetUnit() string {
        if m == nil {
                return ""
        }
        return m.Unit
}

// init registers the MetricType enum and then every message type of this
// file with the proto package, each under its "io.prometheus.client" name.
// The message types are registered in the order they are listed below.
func init() {
        proto.RegisterEnum("io.prometheus.client.MetricType", MetricType_name, MetricType_value)

        registrations := []struct {
                msg  proto.Message
                name string
        }{
                {(*LabelPair)(nil), "io.prometheus.client.LabelPair"},
                {(*Gauge)(nil), "io.prometheus.client.Gauge"},
                {(*Counter)(nil), "io.prometheus.client.Counter"},
                {(*Quantile)(nil), "io.prometheus.client.Quantile"},
                {(*Summary)(nil), "io.prometheus.client.Summary"},
                {(*Untyped)(nil), "io.prometheus.client.Untyped"},
                {(*Histogram)(nil), "io.prometheus.client.Histogram"},
                {(*Bucket)(nil), "io.prometheus.client.Bucket"},
                {(*BucketSpan)(nil), "io.prometheus.client.BucketSpan"},
                {(*Exemplar)(nil), "io.prometheus.client.Exemplar"},
                {(*Metric)(nil), "io.prometheus.client.Metric"},
                {(*MetricFamily)(nil), "io.prometheus.client.MetricFamily"},
        }
        for _, r := range registrations {
                proto.RegisterType(r.msg, r.name)
        }
}

// init registers the gzipped descriptor bytes of metrics.proto with the
// proto package under the file's import path.
func init() {
        const protoPath = "io/prometheus/client/metrics.proto"
        proto.RegisterFile(protoPath, fileDescriptor_d1e5ddb18987a258)
}

// fileDescriptor_d1e5ddb18987a258 holds the gzip-compressed
// FileDescriptorProto that init registers for
// "io/prometheus/client/metrics.proto" and that the Descriptor methods of the
// message types in this file return. The bytes are generated data and must
// not be edited by hand.
var fileDescriptor_d1e5ddb18987a258 = []byte{
        // 982 bytes of a gzipped FileDescriptorProto
        0x1f, 0x8b, 0x08, 0x00, 0x00, 0x00, 0x00, 0x00, 0x02, 0xff, 0xa4, 0x56, 0x4d, 0x8f, 0xdb, 0x44,
        0x18, 0xae, 0x9b, 0x4f, 0xbf, 0xd9, 0x6c, 0xbd, 0x43, 0x54, 0x59, 0x0b, 0xbb, 0x09, 0x96, 0x90,
        0x16, 0x84, 0x12, 0x01, 0x45, 0xa0, 0xb2, 0x48, 0xec, 0xb6, 0xdb, 0x14, 0x95, 0xb4, 0x65, 0x92,
        0x1c, 0xca, 0xc5, 0x9a, 0x24, 0xb3, 0x8e, 0x85, 0xbf, 0xb0, 0xc7, 0x15, 0xcb, 0x9d, 0xdf, 0xc0,
        0x1f, 0xe0, 0x67, 0x70, 0x46, 0x3d, 0x72, 0xe2, 0x88, 0xd0, 0xfe, 0x0e, 0x0e, 0x68, 0xbe, 0xec,
        0x6c, 0xe5, 0x2c, 0x2c, 0xdc, 0x3c, 0x8f, 0x9f, 0x67, 0xe6, 0x79, 0x1f, 0xdb, 0xef, 0x6b, 0x70,
        0xfc, 0x78, 0x94, 0xa4, 0x71, 0x48, 0xd9, 0x9a, 0xe6, 0xd9, 0x68, 0x19, 0xf8, 0x34, 0x62, 0xa3,
        0x90, 0xb2, 0xd4, 0x5f, 0x66, 0xc3, 0x24, 0x8d, 0x59, 0x8c, 0x7a, 0x7e, 0x3c, 0x2c, 0x39, 0x43,
        0xc9, 0xd9, 0xef, 0x79, 0xb1, 0x17, 0x0b, 0xc2, 0x88, 0x5f, 0x49, 0xee, 0x7e, 0xdf, 0x8b, 0x63,
        0x2f, 0xa0, 0x23, 0xb1, 0x5a, 0xe4, 0xe7, 0x23, 0xe6, 0x87, 0x34, 0x63, 0x24, 0x4c, 0x24, 0xc1,
        0xf9, 0x18, 0xcc, 0xaf, 0xc8, 0x82, 0x06, 0xcf, 0x89, 0x9f, 0x22, 0x04, 0xf5, 0x88, 0x84, 0xd4,
        0x36, 0x06, 0xc6, 0x91, 0x89, 0xc5, 0x35, 0xea, 0x41, 0xe3, 0x25, 0x09, 0x72, 0x6a, 0xdf, 0x16,
        0xa0, 0x5c, 0x38, 0x07, 0xd0, 0x18, 0x93, 0xdc, 0xdb, 0xb8, 0xcd, 0x35, 0x86, 0xbe, 0xfd, 0xb3,
        0x01, 0xad, 0x07, 0x71, 0x1e, 0x31, 0x9a, 0x56, 0x33, 0xd0, 0x7d, 0x68, 0xd3, 0xef, 0x69, 0x98,
        0x04, 0x24, 0x15, 0x3b, 0x77, 0x3e, 0x3c, 0x1c, 0x56, 0xd5, 0x35, 0x3c, 0x53, 0x2c, 0x5c, 0xf0,
        0xd1, 0x18, 0xf6, 0x96, 0x29, 0x25, 0x8c, 0xae, 0xdc, 0xa2, 0x1c, 0xbb, 0x26, 0x36, 0xd9, 0x1f,
        0xca, 0x82, 0x87, 0xba, 0xe0, 0xe1, 0x4c, 0x33, 0xb0, 0xa5, 0x44, 0x05, 0xe2, 0x1c, 0x43, 0xfb,
        0xeb, 0x9c, 0x44, 0xcc, 0x0f, 0x28, 0xda, 0x87, 0xf6, 0x77, 0xea, 0x5a, 0x39, 0x2d, 0xd6, 0x57,
        0x33, 0x28, 0x8a, 0xfc, 0xdd, 0x80, 0xd6, 0x34, 0x0f, 0x43, 0x92, 0x5e, 0xa0, 0xb7, 0x61, 0x27,
        0x23, 0x61, 0x12, 0x50, 0x77, 0xc9, 0xcb, 0x16, 0x3b, 0xd4, 0x71, 0x47, 0x62, 0x22, 0x09, 0x74,
        0x00, 0xa0, 0x28, 0x59, 0x1e, 0xaa, 0x9d, 0x4c, 0x89, 0x4c, 0xf3, 0x10, 0x7d, 0xb1, 0x71, 0x7e,
        0x6d, 0x50, 0xdb, 0x1e, 0x88, 0x76, 0x7c, 0x5a, 0x7f, 0xf5, 0x47, 0xff, 0xd6, 0x86, 0xcb, 0xca,
        0x58, 0xea, 0xff, 0x21, 0x96, 0x3e, 0xb4, 0xe6, 0x11, 0xbb, 0x48, 0xe8, 0x6a, 0xcb, 0xe3, 0xfd,
        0xab, 0x01, 0xe6, 0x63, 0x3f, 0x63, 0xb1, 0x97, 0x92, 0xf0, 0xdf, 0xd4, 0xfe, 0x3e, 0xa0, 0x4d,
        0x8a, 0x7b, 0x1e, 0xc4, 0x84, 0x09, 0x6f, 0x06, 0xb6, 0x36, 0x88, 0x8f, 0x38, 0xfe, 0x4f, 0x49,
        0xdd, 0x87, 0xe6, 0x22, 0x5f, 0x7e, 0x4b, 0x99, 0xca, 0xe9, 0xad, 0xea, 0x9c, 0x4e, 0x05, 0x47,
        0xa5, 0xa4, 0x14, 0xd5, 0x19, 0xdd, 0xb9, 0x79, 0x46, 0xe8, 0x2e, 0x34, 0xb3, 0xe5, 0x9a, 0x86,
        0xc4, 0x6e, 0x0c, 0x8c, 0xa3, 0x3d, 0xac, 0x56, 0xe8, 0x1d, 0xd8, 0xfd, 0x81, 0xa6, 0xb1, 0xcb,
        0xd6, 0x29, 0xcd, 0xd6, 0x71, 0xb0, 0xb2, 0x9b, 0xc2, 0x7f, 0x97, 0xa3, 0x33, 0x0d, 0xf2, 0x12,
        0x05, 0x4d, 0x26, 0xd6, 0x12, 0x89, 0x99, 0x1c, 0x91, 0x79, 0x1d, 0x81, 0x55, 0xde, 0x56, 0x69,
        0xb5, 0xc5, 0x3e, 0xbb, 0x05, 0x49, 0x66, 0xf5, 0x04, 0xba, 0x11, 0xf5, 0x08, 0xf3, 0x5f, 0x52,
        0x37, 0x4b, 0x48, 0x64, 0x9b, 0x22, 0x93, 0xc1, 0x75, 0x99, 0x4c, 0x13, 0x12, 0xa9, 0x5c, 0x76,
        0xb4, 0x98, 0x63, 0xdc, 0x7c, 0xb1, 0xd9, 0x8a, 0x06, 0x8c, 0xd8, 0x30, 0xa8, 0x1d, 0x21, 0x5c,
        0x1c, 0xf1, 0x90, 0x83, 0x57, 0x68, 0xb2, 0x80, 0xce, 0xa0, 0xc6, 0x6b, 0xd4, 0xa8, 0x2c, 0xe2,
        0x09, 0x74, 0x93, 0x38, 0xf3, 0x4b, 0x6b, 0x3b, 0x37, 0xb3, 0xa6, 0xc5, 0xda, 0x5a, 0xb1, 0x99,
        0xb4, 0xd6, 0x95, 0xd6, 0x34, 0x5a, 0x58, 0x2b, 0x68, 0xd2, 0xda, 0xae, 0xb4, 0xa6, 0x51, 0x69,
        0xed, 0x18, 0x4c, 0xdd, 0x4d, 0x32, 0xdb, 0xba, 0xee, 0x6b, 0x2b, 0xda, 0x4f, 0x29, 0x70, 0x7e,
        0x35, 0xa0, 0x29, 0xed, 0xa2, 0x77, 0xc1, 0x5a, 0xe6, 0x61, 0x1e, 0x6c, 0x86, 0x21, 0xdf, 0xff,
        0x3b, 0x25, 0x2e, 0xcf, 0xbc, 0x07, 0x77, 0x5f, 0xa7, 0x5e, 0xf9, 0x0e, 0x7a, 0xaf, 0x09, 0xe4,
        0xf3, 0xed, 0x43, 0x27, 0x4f, 0x12, 0x9a, 0xba, 0x8b, 0x38, 0x8f, 0x56, 0xea, 0x63, 0x00, 0x01,
        0x9d, 0x72, 0xe4, 0x4a, 0x23, 0xad, 0xdd, 0xac, 0x91, 0x3a, 0xc7, 0x00, 0x65, 0xec, 0xfc, 0x95,
        0x8e, 0xcf, 0xcf, 0x33, 0x2a, 0x2b, 0xd8, 0xc3, 0x6a, 0xc5, 0xf1, 0x80, 0x46, 0x1e, 0x5b, 0x8b,
        0xd3, 0xbb, 0x58, 0xad, 0x9c, 0x9f, 0x0c, 0x68, 0xeb, 0x4d, 0xd1, 0x67, 0xd0, 0x08, 0xf8, 0x1c,
        0xb1, 0x0d, 0x91, 0x66, 0xbf, 0xda, 0x43, 0x31, 0x6a, 0xd4, 0x33, 0x96, 0x9a, 0xea, 0xfe, 0x8a,
        0x3e, 0x05, 0xf3, 0x26, 0xed, 0xbd, 0x24, 0x3b, 0x3f, 0xd6, 0xa0, 0x39, 0x11, 0x33, 0xf3, 0xff,
        0xf9, 0xfa, 0x00, 0x1a, 0x1e, 0x9f, 0x72, 0x6a, 0x42, 0xbd, 0x59, 0x2d, 0x16, 0x83, 0x10, 0x4b,
        0x26, 0xfa, 0x04, 0x5a, 0x4b, 0x39, 0xf8, 0x94, 0xe5, 0x83, 0x6a, 0x91, 0x9a, 0x8e, 0x58, 0xb3,
        0xb9, 0x30, 0x93, 0xc3, 0x44, 0xf5, 0xec, 0x2d, 0x42, 0x35, 0x71, 0xb0, 0x66, 0x73, 0x61, 0x2e,
        0xbb, 0xb5, 0x68, 0x45, 0x5b, 0x85, 0xaa, 0xa5, 0x63, 0xcd, 0x46, 0x9f, 0x83, 0xb9, 0xd6, 0x4d,
        0x5c, 0xb4, 0xa0, 0xad, 0xf1, 0x14, 0xbd, 0x1e, 0x97, 0x0a, 0xde, 0xf6, 0x8b, 0xc4, 0xdd, 0x30,
        0x13, 0x7d, 0xae, 0x86, 0x3b, 0x05, 0x36, 0xc9, 0x9c, 0x5f, 0x0c, 0xd8, 0x91, 0xcf, 0xe1, 0x11,
        0x09, 0xfd, 0xe0, 0xa2, 0xf2, 0x07, 0x03, 0x41, 0x7d, 0x4d, 0x83, 0x44, 0xfd, 0x5f, 0x88, 0x6b,
        0x74, 0x0f, 0xea, 0xdc, 0xa3, 0x88, 0x70, 0x77, 0x5b, 0xc7, 0x90, 0x3b, 0xcf, 0x2e, 0x12, 0x8a,
        0x05, 0x9b, 0x0f, 0x06, 0xf9, 0xa7, 0x64, 0xd7, 0xaf, 0x1b, 0x0c, 0x52, 0xa7, 0x07, 0x83, 0x54,
        0x70, 0x17, 0x79, 0xe4, 0x33, 0x11, 0xa1, 0x89, 0xc5, 0xf5, 0x7b, 0x0b, 0x80, 0xf2, 0x0c, 0xd4,
        0x81, 0xd6, 0x83, 0x67, 0xf3, 0xa7, 0xb3, 0x33, 0x6c, 0xdd, 0x42, 0x26, 0x34, 0xc6, 0x27, 0xf3,
        0xf1, 0x99, 0x65, 0x70, 0x7c, 0x3a, 0x9f, 0x4c, 0x4e, 0xf0, 0x0b, 0xeb, 0x36, 0x5f, 0xcc, 0x9f,
        0xce, 0x5e, 0x3c, 0x3f, 0x7b, 0x68, 0xd5, 0x50, 0x17, 0xcc, 0xc7, 0x5f, 0x4e, 0x67, 0xcf, 0xc6,
        0xf8, 0x64, 0x62, 0xd5, 0xd1, 0x1b, 0x70, 0x47, 0x68, 0xdc, 0x12, 0x6c, 0x9c, 0x3a, 0xaf, 0x2e,
        0x0f, 0x8d, 0xdf, 0x2e, 0x0f, 0x8d, 0x3f, 0x2f, 0x0f, 0x8d, 0x6f, 0x7a, 0x7e, 0xec, 0x96, 0x86,
        0x5d, 0x69, 0x78, 0xd1, 0x14, 0x6f, 0xfb, 0x47, 0x7f, 0x07, 0x00, 0x00, 0xff, 0xff, 0x1c, 0xe1,
        0xcf, 0xb8, 0x1d, 0x0a, 0x00, 0x00,
}

// Marshal allocates a buffer of m.Size() bytes, encodes m into it with
// MarshalToSizedBuffer and returns the written portion. On failure it
// returns nil and the error.
func (m *LabelPair) Marshal() (dAtA []byte, err error) {
        buf := make([]byte, m.Size())
        written, err := m.MarshalToSizedBuffer(buf)
        if err != nil {
                return nil, err
        }
        return buf[:written], nil
}

// MarshalTo encodes m into the first m.Size() bytes of dAtA by delegating to
// MarshalToSizedBuffer and returns that call's results.
func (m *LabelPair) MarshalTo(dAtA []byte) (int, error) {
        return m.MarshalToSizedBuffer(dAtA[:m.Size()])
}

// MarshalToSizedBuffer encodes m into the tail of dAtA, moving a write
// cursor backwards from len(dAtA). It writes the unrecognized bytes first,
// then Value (tag byte 0x12) and Name (tag byte 0xa), each only when
// non-empty, and returns the number of bytes written.
func (m *LabelPair) MarshalToSizedBuffer(dAtA []byte) (int, error) {
        pos := len(dAtA)
        if extra := m.XXX_unrecognized; extra != nil {
                pos -= len(extra)
                copy(dAtA[pos:], extra)
        }
        if n := len(m.Value); n > 0 {
                pos -= n
                copy(dAtA[pos:], m.Value)
                pos = encodeVarintMetrics(dAtA, pos, uint64(n))
                pos--
                dAtA[pos] = 0x12
        }
        if n := len(m.Name); n > 0 {
                pos -= n
                copy(dAtA[pos:], m.Name)
                pos = encodeVarintMetrics(dAtA, pos, uint64(n))
                pos--
                dAtA[pos] = 0xa
        }
        return len(dAtA) - pos, nil
}

// Marshal allocates a buffer of m.Size() bytes, encodes m into it with
// MarshalToSizedBuffer and returns the written portion. On failure it
// returns nil and the error.
func (m *Gauge) Marshal() (dAtA []byte, err error) {
        buf := make([]byte, m.Size())
        written, err := m.MarshalToSizedBuffer(buf)
        if err != nil {
                return nil, err
        }
        return buf[:written], nil
}

// MarshalTo encodes m into the first m.Size() bytes of dAtA by delegating to
// MarshalToSizedBuffer and returns that call's results.
func (m *Gauge) MarshalTo(dAtA []byte) (int, error) {
        return m.MarshalToSizedBuffer(dAtA[:m.Size()])
}

// MarshalToSizedBuffer encodes m into the tail of dAtA, moving a write
// cursor backwards from len(dAtA). It writes the unrecognized bytes first,
// then Value as 8 little-endian bytes (tag byte 0x9) when it is non-zero,
// and returns the number of bytes written.
func (m *Gauge) MarshalToSizedBuffer(dAtA []byte) (int, error) {
        pos := len(dAtA)
        if extra := m.XXX_unrecognized; extra != nil {
                pos -= len(extra)
                copy(dAtA[pos:], extra)
        }
        if m.Value != 0 {
                pos -= 8
                bits := math.Float64bits(float64(m.Value))
                encoding_binary.LittleEndian.PutUint64(dAtA[pos:], bits)
                pos--
                dAtA[pos] = 0x9
        }
        return len(dAtA) - pos, nil
}

// Marshal allocates a buffer of m.Size() bytes, encodes m into it with
// MarshalToSizedBuffer and returns the written portion. On failure it
// returns nil and the error.
func (m *Counter) Marshal() (dAtA []byte, err error) {
        buf := make([]byte, m.Size())
        written, err := m.MarshalToSizedBuffer(buf)
        if err != nil {
                return nil, err
        }
        return buf[:written], nil
}

// MarshalTo encodes m into the first m.Size() bytes of dAtA by delegating to
// MarshalToSizedBuffer and returns that call's results.
func (m *Counter) MarshalTo(dAtA []byte) (int, error) {
        return m.MarshalToSizedBuffer(dAtA[:m.Size()])
}

// MarshalToSizedBuffer encodes m into the tail of dAtA, moving a write
// cursor backwards from len(dAtA). In write order it emits the unrecognized
// bytes, CreatedTimestamp (tag byte 0x1a) and Exemplar (tag byte 0x12) when
// non-nil, each preceded by its varint length, and Value as 8 little-endian
// bytes (tag byte 0x9) when non-zero. It returns the number of bytes
// written, or 0 and the error from a nested message.
func (m *Counter) MarshalToSizedBuffer(dAtA []byte) (int, error) {
        pos := len(dAtA)
        if extra := m.XXX_unrecognized; extra != nil {
                pos -= len(extra)
                copy(dAtA[pos:], extra)
        }
        if ts := m.CreatedTimestamp; ts != nil {
                n, err := ts.MarshalToSizedBuffer(dAtA[:pos])
                if err != nil {
                        return 0, err
                }
                pos -= n
                pos = encodeVarintMetrics(dAtA, pos, uint64(n))
                pos--
                dAtA[pos] = 0x1a
        }
        if ex := m.Exemplar; ex != nil {
                n, err := ex.MarshalToSizedBuffer(dAtA[:pos])
                if err != nil {
                        return 0, err
                }
                pos -= n
                pos = encodeVarintMetrics(dAtA, pos, uint64(n))
                pos--
                dAtA[pos] = 0x12
        }
        if m.Value != 0 {
                pos -= 8
                bits := math.Float64bits(float64(m.Value))
                encoding_binary.LittleEndian.PutUint64(dAtA[pos:], bits)
                pos--
                dAtA[pos] = 0x9
        }
        return len(dAtA) - pos, nil
}

// Marshal allocates a buffer of m.Size() bytes, encodes m into it with
// MarshalToSizedBuffer and returns the written portion. On failure it
// returns nil and the error.
func (m *Quantile) Marshal() (dAtA []byte, err error) {
        buf := make([]byte, m.Size())
        written, err := m.MarshalToSizedBuffer(buf)
        if err != nil {
                return nil, err
        }
        return buf[:written], nil
}

// MarshalTo encodes m into the first m.Size() bytes of dAtA by delegating to
// MarshalToSizedBuffer and returns that call's results.
func (m *Quantile) MarshalTo(dAtA []byte) (int, error) {
        return m.MarshalToSizedBuffer(dAtA[:m.Size()])
}

// MarshalToSizedBuffer encodes m into the tail of dAtA, moving a write
// cursor backwards from len(dAtA). It writes the unrecognized bytes first,
// then Value (tag byte 0x11) and Quantile (tag byte 0x9), each as 8
// little-endian bytes and only when non-zero, and returns the number of
// bytes written.
func (m *Quantile) MarshalToSizedBuffer(dAtA []byte) (int, error) {
        pos := len(dAtA)
        if extra := m.XXX_unrecognized; extra != nil {
                pos -= len(extra)
                copy(dAtA[pos:], extra)
        }
        if m.Value != 0 {
                pos -= 8
                bits := math.Float64bits(float64(m.Value))
                encoding_binary.LittleEndian.PutUint64(dAtA[pos:], bits)
                pos--
                dAtA[pos] = 0x11
        }
        if m.Quantile != 0 {
                pos -= 8
                bits := math.Float64bits(float64(m.Quantile))
                encoding_binary.LittleEndian.PutUint64(dAtA[pos:], bits)
                pos--
                dAtA[pos] = 0x9
        }
        return len(dAtA) - pos, nil
}

// Marshal allocates a buffer of m.Size() bytes, encodes m into it with
// MarshalToSizedBuffer and returns the written portion. On failure it
// returns nil and the error.
func (m *Summary) Marshal() (dAtA []byte, err error) {
        buf := make([]byte, m.Size())
        written, err := m.MarshalToSizedBuffer(buf)
        if err != nil {
                return nil, err
        }
        return buf[:written], nil
}

// MarshalTo encodes m into the first m.Size() bytes of dAtA by delegating to
// MarshalToSizedBuffer and returns that call's results.
func (m *Summary) MarshalTo(dAtA []byte) (int, error) {
        return m.MarshalToSizedBuffer(dAtA[:m.Size()])
}

// MarshalToSizedBuffer writes the encoding of m into the tail of dAtA,
// working backwards from the end of the buffer, and returns the number of
// bytes written. Emission order is: unrecognized bytes, CreatedTimestamp
// (tag 0x22), the Quantile entries from last to first (tag 0x1a each),
// SampleSum (tag 0x11) and SampleCount (tag 0x8). Nil or zero fields are
// skipped. An error from a nested MarshalToSizedBuffer call aborts the
// encoding and is returned with a count of 0.
func (m *Summary) MarshalToSizedBuffer(dAtA []byte) (int, error) {
        total := len(dAtA)
        pos := total
        if m.XXX_unrecognized != nil {
                pos -= len(m.XXX_unrecognized)
                copy(dAtA[pos:], m.XXX_unrecognized)
        }
        if m.CreatedTimestamp != nil {
                // Nested message: payload, then its varint length, then the tag.
                n, err := m.CreatedTimestamp.MarshalToSizedBuffer(dAtA[:pos])
                if err != nil {
                        return 0, err
                }
                pos = encodeVarintMetrics(dAtA, pos-n, uint64(n))
                pos--
                dAtA[pos] = 0x22
        }
        for k := len(m.Quantile) - 1; k >= 0; k-- {
                n, err := m.Quantile[k].MarshalToSizedBuffer(dAtA[:pos])
                if err != nil {
                        return 0, err
                }
                pos = encodeVarintMetrics(dAtA, pos-n, uint64(n))
                pos--
                dAtA[pos] = 0x1a
        }
        if m.SampleSum != 0 {
                pos -= 8
                encoding_binary.LittleEndian.PutUint64(dAtA[pos:], math.Float64bits(float64(m.SampleSum)))
                pos--
                dAtA[pos] = 0x11
        }
        if m.SampleCount != 0 {
                pos = encodeVarintMetrics(dAtA, pos, uint64(m.SampleCount))
                pos--
                dAtA[pos] = 0x8
        }
        return total - pos, nil
}

// Marshal encodes m into a fresh buffer of m.Size() bytes and returns the
// part MarshalToSizedBuffer reports as written. On failure it returns nil
// and the underlying error.
func (m *Untyped) Marshal() (dAtA []byte, err error) {
        buf := make([]byte, m.Size())
        written, err := m.MarshalToSizedBuffer(buf)
        if err != nil {
                return nil, err
        }
        return buf[:written], nil
}

// MarshalTo hands dAtA[:m.Size()] to MarshalToSizedBuffer and returns its
// results unchanged. The slice expression panics when m.Size() exceeds
// cap(dAtA).
func (m *Untyped) MarshalTo(dAtA []byte) (int, error) {
        window := dAtA[:m.Size()]
        return m.MarshalToSizedBuffer(window)
}

// MarshalToSizedBuffer writes the encoding of m into the tail of dAtA,
// back to front, and returns the number of bytes written. The unrecognized
// bytes go last in the output; a non-zero Value is emitted before them as
// tag 0x9 followed by its 8-byte little-endian bit pattern. The returned
// error is always nil.
func (m *Untyped) MarshalToSizedBuffer(dAtA []byte) (int, error) {
        total := len(dAtA)
        pos := total
        if m.XXX_unrecognized != nil {
                pos -= len(m.XXX_unrecognized)
                copy(dAtA[pos:], m.XXX_unrecognized)
        }
        if m.Value != 0 {
                pos -= 8
                encoding_binary.LittleEndian.PutUint64(dAtA[pos:], math.Float64bits(float64(m.Value)))
                pos--
                dAtA[pos] = 0x9
        }
        return total - pos, nil
}

// Marshal serializes m into a newly allocated slice of m.Size() bytes and
// returns the prefix MarshalToSizedBuffer reports as written. If encoding
// fails, the slice result is nil and the error is returned.
func (m *Histogram) Marshal() (dAtA []byte, err error) {
        buf := make([]byte, m.Size())
        written, err := m.MarshalToSizedBuffer(buf)
        if err != nil {
                return nil, err
        }
        return buf[:written], nil
}

// MarshalTo encodes m into dAtA[:m.Size()] using MarshalToSizedBuffer and
// returns that call's results. Slicing panics if m.Size() is larger than
// cap(dAtA).
func (m *Histogram) MarshalTo(dAtA []byte) (int, error) {
        window := dAtA[:m.Size()]
        return m.MarshalToSizedBuffer(window)
}

// MarshalToSizedBuffer writes the encoding of m into the tail of dAtA and
// returns the number of bytes written. The buffer is filled from its end
// towards its start, so the sections below run from the data that ends up
// last in the output (unrecognized bytes, Exemplars) to the data that ends
// up first (SampleCount). Empty slices, nil pointers and zero scalars are
// skipped. An error from a nested MarshalToSizedBuffer call aborts the
// encoding and is returned with a count of 0.
func (m *Histogram) MarshalToSizedBuffer(dAtA []byte) (int, error) {
        // i is the write cursor: everything in dAtA[i:] has been produced.
        i := len(dAtA)
        _ = i
        var l int
        _ = l
        if m.XXX_unrecognized != nil {
                i -= len(m.XXX_unrecognized)
                copy(dAtA[i:], m.XXX_unrecognized)
        }
        if len(m.Exemplars) > 0 {
                // Iterate backwards so the entries appear in slice order in the output.
                for iNdEx := len(m.Exemplars) - 1; iNdEx >= 0; iNdEx-- {
                        {
                                size, err := m.Exemplars[iNdEx].MarshalToSizedBuffer(dAtA[:i])
                                if err != nil {
                                        return 0, err
                                }
                                i -= size
                                i = encodeVarintMetrics(dAtA, i, uint64(size))
                        }
                        // Two-byte tag; written high byte first because of the
                        // backwards fill, so the output reads 0x82 0x01.
                        i--
                        dAtA[i] = 0x1
                        i--
                        dAtA[i] = 0x82
                }
        }
        if m.CreatedTimestamp != nil {
                {
                        size, err := m.CreatedTimestamp.MarshalToSizedBuffer(dAtA[:i])
                        if err != nil {
                                return 0, err
                        }
                        i -= size
                        i = encodeVarintMetrics(dAtA, i, uint64(size))
                }
                i--
                dAtA[i] = 0x7a
        }
        if len(m.PositiveCount) > 0 {
                // Packed run of 8-byte little-endian values, then the byte length, then the tag.
                for iNdEx := len(m.PositiveCount) - 1; iNdEx >= 0; iNdEx-- {
                        f5 := math.Float64bits(float64(m.PositiveCount[iNdEx]))
                        i -= 8
                        encoding_binary.LittleEndian.PutUint64(dAtA[i:], uint64(f5))
                }
                i = encodeVarintMetrics(dAtA, i, uint64(len(m.PositiveCount)*8))
                i--
                dAtA[i] = 0x72
        }
        if len(m.PositiveDelta) > 0 {
                // The deltas are zigzag-encoded and written as varints into a scratch
                // buffer (10 bytes reserved per value) in forward order, because the
                // total encoded length is not known until all values are encoded.
                var j6 int
                dAtA8 := make([]byte, len(m.PositiveDelta)*10)
                for _, num := range m.PositiveDelta {
                        x7 := (uint64(num) << 1) ^ uint64((num >> 63))
                        for x7 >= 1<<7 {
                                dAtA8[j6] = uint8(uint64(x7)&0x7f | 0x80)
                                j6++
                                x7 >>= 7
                        }
                        dAtA8[j6] = uint8(x7)
                        j6++
                }
                // Copy the j6 used scratch bytes in front of the cursor, then length and tag.
                i -= j6
                copy(dAtA[i:], dAtA8[:j6])
                i = encodeVarintMetrics(dAtA, i, uint64(j6))
                i--
                dAtA[i] = 0x6a
        }
        if len(m.PositiveSpan) > 0 {
                for iNdEx := len(m.PositiveSpan) - 1; iNdEx >= 0; iNdEx-- {
                        {
                                size, err := m.PositiveSpan[iNdEx].MarshalToSizedBuffer(dAtA[:i])
                                if err != nil {
                                        return 0, err
                                }
                                i -= size
                                i = encodeVarintMetrics(dAtA, i, uint64(size))
                        }
                        i--
                        dAtA[i] = 0x62
                }
        }
        if len(m.NegativeCount) > 0 {
                // Same packed fixed-width layout as PositiveCount.
                for iNdEx := len(m.NegativeCount) - 1; iNdEx >= 0; iNdEx-- {
                        f9 := math.Float64bits(float64(m.NegativeCount[iNdEx]))
                        i -= 8
                        encoding_binary.LittleEndian.PutUint64(dAtA[i:], uint64(f9))
                }
                i = encodeVarintMetrics(dAtA, i, uint64(len(m.NegativeCount)*8))
                i--
                dAtA[i] = 0x5a
        }
        if len(m.NegativeDelta) > 0 {
                // Same zigzag-varint scratch-buffer scheme as PositiveDelta.
                var j10 int
                dAtA12 := make([]byte, len(m.NegativeDelta)*10)
                for _, num := range m.NegativeDelta {
                        x11 := (uint64(num) << 1) ^ uint64((num >> 63))
                        for x11 >= 1<<7 {
                                dAtA12[j10] = uint8(uint64(x11)&0x7f | 0x80)
                                j10++
                                x11 >>= 7
                        }
                        dAtA12[j10] = uint8(x11)
                        j10++
                }
                i -= j10
                copy(dAtA[i:], dAtA12[:j10])
                i = encodeVarintMetrics(dAtA, i, uint64(j10))
                i--
                dAtA[i] = 0x52
        }
        if len(m.NegativeSpan) > 0 {
                for iNdEx := len(m.NegativeSpan) - 1; iNdEx >= 0; iNdEx-- {
                        {
                                size, err := m.NegativeSpan[iNdEx].MarshalToSizedBuffer(dAtA[:i])
                                if err != nil {
                                        return 0, err
                                }
                                i -= size
                                i = encodeVarintMetrics(dAtA, i, uint64(size))
                        }
                        i--
                        dAtA[i] = 0x4a
                }
        }
        if m.ZeroCountFloat != 0 {
                i -= 8
                encoding_binary.LittleEndian.PutUint64(dAtA[i:], uint64(math.Float64bits(float64(m.ZeroCountFloat))))
                i--
                dAtA[i] = 0x41
        }
        if m.ZeroCount != 0 {
                i = encodeVarintMetrics(dAtA, i, uint64(m.ZeroCount))
                i--
                dAtA[i] = 0x38
        }
        if m.ZeroThreshold != 0 {
                i -= 8
                encoding_binary.LittleEndian.PutUint64(dAtA[i:], uint64(math.Float64bits(float64(m.ZeroThreshold))))
                i--
                dAtA[i] = 0x31
        }
        if m.Schema != 0 {
                // Schema is zigzag-encoded in 32 bits before being written as a varint.
                i = encodeVarintMetrics(dAtA, i, uint64((uint32(m.Schema)<<1)^uint32((m.Schema>>31))))
                i--
                dAtA[i] = 0x28
        }
        if m.SampleCountFloat != 0 {
                i -= 8
                encoding_binary.LittleEndian.PutUint64(dAtA[i:], uint64(math.Float64bits(float64(m.SampleCountFloat))))
                i--
                dAtA[i] = 0x21
        }
        if len(m.Bucket) > 0 {
                for iNdEx := len(m.Bucket) - 1; iNdEx >= 0; iNdEx-- {
                        {
                                size, err := m.Bucket[iNdEx].MarshalToSizedBuffer(dAtA[:i])
                                if err != nil {
                                        return 0, err
                                }
                                i -= size
                                i = encodeVarintMetrics(dAtA, i, uint64(size))
                        }
                        i--
                        dAtA[i] = 0x1a
                }
        }
        if m.SampleSum != 0 {
                i -= 8
                encoding_binary.LittleEndian.PutUint64(dAtA[i:], uint64(math.Float64bits(float64(m.SampleSum))))
                i--
                dAtA[i] = 0x11
        }
        if m.SampleCount != 0 {
                i = encodeVarintMetrics(dAtA, i, uint64(m.SampleCount))
                i--
                dAtA[i] = 0x8
        }
        // Bytes written is the distance the cursor moved back from the end.
        return len(dAtA) - i, nil
}

// Marshal encodes m into a freshly allocated buffer of m.Size() bytes and
// returns the bytes MarshalToSizedBuffer reports as written, or nil and the
// error from that call.
func (m *Bucket) Marshal() (dAtA []byte, err error) {
        buf := make([]byte, m.Size())
        written, err := m.MarshalToSizedBuffer(buf)
        if err != nil {
                return nil, err
        }
        return buf[:written], nil
}

// MarshalTo encodes m into the first m.Size() bytes of dAtA through
// MarshalToSizedBuffer and returns its results unchanged. Slicing panics
// when m.Size() exceeds cap(dAtA).
func (m *Bucket) MarshalTo(dAtA []byte) (int, error) {
        window := dAtA[:m.Size()]
        return m.MarshalToSizedBuffer(window)
}

// MarshalToSizedBuffer writes the encoding of m into the tail of dAtA,
// back to front, and returns the number of bytes written. Emission order
// is: unrecognized bytes, CumulativeCountFloat (tag 0x21), Exemplar
// (tag 0x1a), UpperBound (tag 0x11), CumulativeCount (tag 0x8). Nil or zero
// fields are skipped. An error from Exemplar's MarshalToSizedBuffer is
// returned with a count of 0.
func (m *Bucket) MarshalToSizedBuffer(dAtA []byte) (int, error) {
        total := len(dAtA)
        pos := total
        if m.XXX_unrecognized != nil {
                pos -= len(m.XXX_unrecognized)
                copy(dAtA[pos:], m.XXX_unrecognized)
        }
        if m.CumulativeCountFloat != 0 {
                pos -= 8
                encoding_binary.LittleEndian.PutUint64(dAtA[pos:], math.Float64bits(float64(m.CumulativeCountFloat)))
                pos--
                dAtA[pos] = 0x21
        }
        if m.Exemplar != nil {
                // Nested message: payload, then its varint length, then the tag.
                n, err := m.Exemplar.MarshalToSizedBuffer(dAtA[:pos])
                if err != nil {
                        return 0, err
                }
                pos = encodeVarintMetrics(dAtA, pos-n, uint64(n))
                pos--
                dAtA[pos] = 0x1a
        }
        if m.UpperBound != 0 {
                pos -= 8
                encoding_binary.LittleEndian.PutUint64(dAtA[pos:], math.Float64bits(float64(m.UpperBound)))
                pos--
                dAtA[pos] = 0x11
        }
        if m.CumulativeCount != 0 {
                pos = encodeVarintMetrics(dAtA, pos, uint64(m.CumulativeCount))
                pos--
                dAtA[pos] = 0x8
        }
        return total - pos, nil
}

// Marshal allocates m.Size() bytes, encodes m into them with
// MarshalToSizedBuffer, and returns the portion reported as written. A
// failed encode yields a nil slice and the error.
func (m *BucketSpan) Marshal() (dAtA []byte, err error) {
        buf := make([]byte, m.Size())
        written, err := m.MarshalToSizedBuffer(buf)
        if err != nil {
                return nil, err
        }
        return buf[:written], nil
}

// MarshalTo passes dAtA[:m.Size()] to MarshalToSizedBuffer and returns what
// it returns. The slice expression panics if m.Size() exceeds cap(dAtA).
func (m *BucketSpan) MarshalTo(dAtA []byte) (int, error) {
        window := dAtA[:m.Size()]
        return m.MarshalToSizedBuffer(window)
}

// MarshalToSizedBuffer writes the encoding of m into the tail of dAtA,
// back to front, and returns the number of bytes written. Emission order
// is: unrecognized bytes, Length (tag 0x10, plain varint), Offset (tag 0x8,
// zigzag-encoded in 32 bits and then written as a varint). Zero fields are
// skipped. The returned error is always nil.
func (m *BucketSpan) MarshalToSizedBuffer(dAtA []byte) (int, error) {
        total := len(dAtA)
        pos := total
        if m.XXX_unrecognized != nil {
                pos -= len(m.XXX_unrecognized)
                copy(dAtA[pos:], m.XXX_unrecognized)
        }
        if m.Length != 0 {
                pos = encodeVarintMetrics(dAtA, pos, uint64(m.Length))
                pos--
                dAtA[pos] = 0x10
        }
        if m.Offset != 0 {
                zigzag := (uint32(m.Offset) << 1) ^ uint32((m.Offset >> 31))
                pos = encodeVarintMetrics(dAtA, pos, uint64(zigzag))
                pos--
                dAtA[pos] = 0x8
        }
        return total - pos, nil
}

// Marshal serializes m into a new slice of m.Size() bytes and returns the
// bytes MarshalToSizedBuffer reports as written. On error it returns nil
// and that error.
func (m *Exemplar) Marshal() (dAtA []byte, err error) {
        buf := make([]byte, m.Size())
        written, err := m.MarshalToSizedBuffer(buf)
        if err != nil {
                return nil, err
        }
        return buf[:written], nil
}

// MarshalTo encodes m into dAtA[:m.Size()] by calling MarshalToSizedBuffer
// and returns its results. Slicing panics when m.Size() exceeds cap(dAtA).
func (m *Exemplar) MarshalTo(dAtA []byte) (int, error) {
        window := dAtA[:m.Size()]
        return m.MarshalToSizedBuffer(window)
}

// MarshalToSizedBuffer writes the encoding of m into the tail of dAtA,
// back to front, and returns the number of bytes written. Emission order
// is: unrecognized bytes, Timestamp (tag 0x1a), Value (tag 0x11), then the
// Label entries from last to first (tag 0xa each). Nil or zero fields are
// skipped. An error from a nested MarshalToSizedBuffer call is returned
// with a count of 0.
func (m *Exemplar) MarshalToSizedBuffer(dAtA []byte) (int, error) {
        total := len(dAtA)
        pos := total
        if m.XXX_unrecognized != nil {
                pos -= len(m.XXX_unrecognized)
                copy(dAtA[pos:], m.XXX_unrecognized)
        }
        if m.Timestamp != nil {
                // Nested message: payload, then its varint length, then the tag.
                n, err := m.Timestamp.MarshalToSizedBuffer(dAtA[:pos])
                if err != nil {
                        return 0, err
                }
                pos = encodeVarintMetrics(dAtA, pos-n, uint64(n))
                pos--
                dAtA[pos] = 0x1a
        }
        if m.Value != 0 {
                pos -= 8
                encoding_binary.LittleEndian.PutUint64(dAtA[pos:], math.Float64bits(float64(m.Value)))
                pos--
                dAtA[pos] = 0x11
        }
        for k := len(m.Label) - 1; k >= 0; k-- {
                n, err := m.Label[k].MarshalToSizedBuffer(dAtA[:pos])
                if err != nil {
                        return 0, err
                }
                pos = encodeVarintMetrics(dAtA, pos-n, uint64(n))
                pos--
                dAtA[pos] = 0xa
        }
        return total - pos, nil
}

// Marshal encodes m into a newly allocated buffer of m.Size() bytes and
// returns the bytes MarshalToSizedBuffer reports as written, or nil plus
// the error when encoding fails.
func (m *Metric) Marshal() (dAtA []byte, err error) {
        buf := make([]byte, m.Size())
        written, err := m.MarshalToSizedBuffer(buf)
        if err != nil {
                return nil, err
        }
        return buf[:written], nil
}

// MarshalTo encodes m into the first m.Size() bytes of dAtA via
// MarshalToSizedBuffer and returns its results unchanged. Slicing panics if
// m.Size() exceeds cap(dAtA).
func (m *Metric) MarshalTo(dAtA []byte) (int, error) {
        window := dAtA[:m.Size()]
        return m.MarshalToSizedBuffer(window)
}

// MarshalToSizedBuffer writes the encoding of m into the tail of dAtA,
// back to front, and returns the number of bytes written. Emission order
// is: unrecognized bytes, Histogram (tag 0x3a), TimestampMs (tag 0x30),
// Untyped (tag 0x2a), Summary (tag 0x22), Counter (tag 0x1a), Gauge
// (tag 0x12), then the Label entries from last to first (tag 0xa each).
// Nil or zero fields are skipped. An error from any nested
// MarshalToSizedBuffer call is returned with a count of 0.
func (m *Metric) MarshalToSizedBuffer(dAtA []byte) (int, error) {
        total := len(dAtA)
        pos := total
        if m.XXX_unrecognized != nil {
                pos -= len(m.XXX_unrecognized)
                copy(dAtA[pos:], m.XXX_unrecognized)
        }
        // Each nested message below is written as payload, then its varint
        // length, then its one-byte tag.
        if m.Histogram != nil {
                n, err := m.Histogram.MarshalToSizedBuffer(dAtA[:pos])
                if err != nil {
                        return 0, err
                }
                pos = encodeVarintMetrics(dAtA, pos-n, uint64(n))
                pos--
                dAtA[pos] = 0x3a
        }
        if m.TimestampMs != 0 {
                pos = encodeVarintMetrics(dAtA, pos, uint64(m.TimestampMs))
                pos--
                dAtA[pos] = 0x30
        }
        if m.Untyped != nil {
                n, err := m.Untyped.MarshalToSizedBuffer(dAtA[:pos])
                if err != nil {
                        return 0, err
                }
                pos = encodeVarintMetrics(dAtA, pos-n, uint64(n))
                pos--
                dAtA[pos] = 0x2a
        }
        if m.Summary != nil {
                n, err := m.Summary.MarshalToSizedBuffer(dAtA[:pos])
                if err != nil {
                        return 0, err
                }
                pos = encodeVarintMetrics(dAtA, pos-n, uint64(n))
                pos--
                dAtA[pos] = 0x22
        }
        if m.Counter != nil {
                n, err := m.Counter.MarshalToSizedBuffer(dAtA[:pos])
                if err != nil {
                        return 0, err
                }
                pos = encodeVarintMetrics(dAtA, pos-n, uint64(n))
                pos--
                dAtA[pos] = 0x1a
        }
        if m.Gauge != nil {
                n, err := m.Gauge.MarshalToSizedBuffer(dAtA[:pos])
                if err != nil {
                        return 0, err
                }
                pos = encodeVarintMetrics(dAtA, pos-n, uint64(n))
                pos--
                dAtA[pos] = 0x12
        }
        for k := len(m.Label) - 1; k >= 0; k-- {
                n, err := m.Label[k].MarshalToSizedBuffer(dAtA[:pos])
                if err != nil {
                        return 0, err
                }
                pos = encodeVarintMetrics(dAtA, pos-n, uint64(n))
                pos--
                dAtA[pos] = 0xa
        }
        return total - pos, nil
}

// Marshal serializes m into a newly allocated slice of m.Size() bytes and
// returns the bytes MarshalToSizedBuffer reports as written. On error the
// slice result is nil and the error is passed through.
func (m *MetricFamily) Marshal() (dAtA []byte, err error) {
        buf := make([]byte, m.Size())
        written, err := m.MarshalToSizedBuffer(buf)
        if err != nil {
                return nil, err
        }
        return buf[:written], nil
}

// MarshalTo encodes m into dAtA[:m.Size()] using MarshalToSizedBuffer and
// returns that call's results. The slice expression panics when m.Size()
// exceeds cap(dAtA).
func (m *MetricFamily) MarshalTo(dAtA []byte) (int, error) {
        window := dAtA[:m.Size()]
        return m.MarshalToSizedBuffer(window)
}

// MarshalToSizedBuffer writes the encoding of m into the tail of dAtA,
// back to front, and returns the number of bytes written. Emission order
// is: unrecognized bytes, Unit (tag 0x2a), the Metric entries from last to
// first (tag 0x22 each), Type (tag 0x18), Help (tag 0x12), Name (tag 0xa).
// Empty strings, empty slices and a zero Type are skipped. An error from a
// Metric entry's MarshalToSizedBuffer is returned with a count of 0.
func (m *MetricFamily) MarshalToSizedBuffer(dAtA []byte) (int, error) {
        total := len(dAtA)
        pos := total
        if m.XXX_unrecognized != nil {
                pos -= len(m.XXX_unrecognized)
                copy(dAtA[pos:], m.XXX_unrecognized)
        }
        if unitLen := len(m.Unit); unitLen > 0 {
                // Length-delimited bytes: payload, then varint length, then tag.
                pos -= unitLen
                copy(dAtA[pos:], m.Unit)
                pos = encodeVarintMetrics(dAtA, pos, uint64(unitLen))
                pos--
                dAtA[pos] = 0x2a
        }
        for k := len(m.Metric) - 1; k >= 0; k-- {
                n, err := m.Metric[k].MarshalToSizedBuffer(dAtA[:pos])
                if err != nil {
                        return 0, err
                }
                pos = encodeVarintMetrics(dAtA, pos-n, uint64(n))
                pos--
                dAtA[pos] = 0x22
        }
        if m.Type != 0 {
                pos = encodeVarintMetrics(dAtA, pos, uint64(m.Type))
                pos--
                dAtA[pos] = 0x18
        }
        if helpLen := len(m.Help); helpLen > 0 {
                pos -= helpLen
                copy(dAtA[pos:], m.Help)
                pos = encodeVarintMetrics(dAtA, pos, uint64(helpLen))
                pos--
                dAtA[pos] = 0x12
        }
        if nameLen := len(m.Name); nameLen > 0 {
                pos -= nameLen
                copy(dAtA[pos:], m.Name)
                pos = encodeVarintMetrics(dAtA, pos, uint64(nameLen))
                pos--
                dAtA[pos] = 0xa
        }
        return total - pos, nil
}

// encodeVarintMetrics writes v as a base-128 varint so that its last byte
// lands at dAtA[offset-1], and returns the index of its first byte. The
// width is taken from sovMetrics(v); every byte except the last carries the
// 0x80 continuation bit.
func encodeVarintMetrics(dAtA []byte, offset int, v uint64) int {
        start := offset - sovMetrics(v)
        cur := start
        for ; v >= 0x80; v >>= 7 {
                dAtA[cur] = uint8(v&0x7f | 0x80)
                cur++
        }
        dAtA[cur] = uint8(v)
        return start
}
// Size returns the number of bytes needed to encode m, or 0 for a nil
// receiver. Each non-empty string contributes one tag byte, a varint length
// and its bytes; unrecognized bytes are counted as-is.
func (m *LabelPair) Size() (n int) {
        if m == nil {
                return 0
        }
        if nameLen := len(m.Name); nameLen > 0 {
                n += 1 + nameLen + sovMetrics(uint64(nameLen))
        }
        if valueLen := len(m.Value); valueLen > 0 {
                n += 1 + valueLen + sovMetrics(uint64(valueLen))
        }
        // len of a nil slice is 0, so no nil guard is needed.
        n += len(m.XXX_unrecognized)
        return n
}

// Size returns the number of bytes needed to encode m, or 0 for a nil
// receiver: 9 bytes for a non-zero Value plus any unrecognized bytes.
func (m *Gauge) Size() (n int) {
        if m == nil {
                return 0
        }
        if m.Value != 0 {
                n += 9
        }
        // len of a nil slice is 0, so no nil guard is needed.
        n += len(m.XXX_unrecognized)
        return n
}

// Size returns the number of bytes needed to encode m, or 0 for a nil
// receiver. A non-zero Value takes 9 bytes; each non-nil nested message
// takes one tag byte, a varint length and its own Size(); unrecognized
// bytes are counted as-is.
func (m *Counter) Size() (n int) {
        if m == nil {
                return 0
        }
        if m.Value != 0 {
                n += 9
        }
        if m.Exemplar != nil {
                sub := m.Exemplar.Size()
                n += 1 + sub + sovMetrics(uint64(sub))
        }
        if m.CreatedTimestamp != nil {
                sub := m.CreatedTimestamp.Size()
                n += 1 + sub + sovMetrics(uint64(sub))
        }
        // len of a nil slice is 0, so no nil guard is needed.
        n += len(m.XXX_unrecognized)
        return n
}

// Size returns the number of bytes needed to encode m, or 0 for a nil
// receiver: 9 bytes for each of Quantile and Value that is non-zero, plus
// any unrecognized bytes.
func (m *Quantile) Size() (n int) {
        if m == nil {
                return 0
        }
        if m.Quantile != 0 {
                n += 9
        }
        if m.Value != 0 {
                n += 9
        }
        // len of a nil slice is 0, so no nil guard is needed.
        n += len(m.XXX_unrecognized)
        return n
}

// Size returns the number of bytes needed to encode m, or 0 for a nil
// receiver. SampleCount is sized as a tag byte plus varint, SampleSum as 9
// bytes, and every Quantile entry and the CreatedTimestamp as a tag byte, a
// varint length and the nested Size(). Unrecognized bytes are counted
// as-is.
func (m *Summary) Size() (n int) {
        if m == nil {
                return 0
        }
        if m.SampleCount != 0 {
                n += 1 + sovMetrics(uint64(m.SampleCount))
        }
        if m.SampleSum != 0 {
                n += 9
        }
        for k := range m.Quantile {
                sub := m.Quantile[k].Size()
                n += 1 + sub + sovMetrics(uint64(sub))
        }
        if m.CreatedTimestamp != nil {
                sub := m.CreatedTimestamp.Size()
                n += 1 + sub + sovMetrics(uint64(sub))
        }
        // len of a nil slice is 0, so no nil guard is needed.
        n += len(m.XXX_unrecognized)
        return n
}

// Size returns the number of bytes needed to encode m, or 0 for a nil
// receiver: 9 bytes for a non-zero Value plus any unrecognized bytes.
func (m *Untyped) Size() (n int) {
        if m == nil {
                return 0
        }
        if m.Value != 0 {
                n += 9
        }
        // len of a nil slice is 0, so no nil guard is needed.
        n += len(m.XXX_unrecognized)
        return n
}

// Size returns the number of bytes needed to encode m, or 0 for a nil
// receiver. Contributions per populated field:
//   - varint scalars (SampleCount, ZeroCount): tag byte + varint width;
//   - Schema: tag byte + zigzag varint width;
//   - floating-point scalars: 9 bytes each;
//   - nested messages (Bucket, NegativeSpan, PositiveSpan, CreatedTimestamp):
//     tag byte + varint length + nested Size();
//   - packed deltas: tag byte + varint length + sum of zigzag varint widths;
//   - packed counts: tag byte + varint length + 8 bytes per element;
//   - Exemplars: as nested messages, but with a two-byte tag;
//   - unrecognized bytes: counted as-is.
func (m *Histogram) Size() (n int) {
        if m == nil {
                return 0
        }
        if m.SampleCount != 0 {
                n += 1 + sovMetrics(uint64(m.SampleCount))
        }
        if m.SampleSum != 0 {
                n += 9
        }
        for k := range m.Bucket {
                sub := m.Bucket[k].Size()
                n += 1 + sub + sovMetrics(uint64(sub))
        }
        if m.SampleCountFloat != 0 {
                n += 9
        }
        if m.Schema != 0 {
                n += 1 + sozMetrics(uint64(m.Schema))
        }
        if m.ZeroThreshold != 0 {
                n += 9
        }
        if m.ZeroCount != 0 {
                n += 1 + sovMetrics(uint64(m.ZeroCount))
        }
        if m.ZeroCountFloat != 0 {
                n += 9
        }
        for k := range m.NegativeSpan {
                sub := m.NegativeSpan[k].Size()
                n += 1 + sub + sovMetrics(uint64(sub))
        }
        if len(m.NegativeDelta) > 0 {
                packed := 0
                for _, delta := range m.NegativeDelta {
                        packed += sozMetrics(uint64(delta))
                }
                n += 1 + sovMetrics(uint64(packed)) + packed
        }
        if count := len(m.NegativeCount); count > 0 {
                packed := count * 8
                n += 1 + sovMetrics(uint64(packed)) + packed
        }
        for k := range m.PositiveSpan {
                sub := m.PositiveSpan[k].Size()
                n += 1 + sub + sovMetrics(uint64(sub))
        }
        if len(m.PositiveDelta) > 0 {
                packed := 0
                for _, delta := range m.PositiveDelta {
                        packed += sozMetrics(uint64(delta))
                }
                n += 1 + sovMetrics(uint64(packed)) + packed
        }
        if count := len(m.PositiveCount); count > 0 {
                packed := count * 8
                n += 1 + sovMetrics(uint64(packed)) + packed
        }
        if m.CreatedTimestamp != nil {
                sub := m.CreatedTimestamp.Size()
                n += 1 + sub + sovMetrics(uint64(sub))
        }
        for k := range m.Exemplars {
                sub := m.Exemplars[k].Size()
                n += 2 + sub + sovMetrics(uint64(sub))
        }
        // len of a nil slice is 0, so no nil guard is needed.
        n += len(m.XXX_unrecognized)
        return n
}

// Size returns the number of bytes needed to encode m, or 0 for a nil
// receiver. CumulativeCount is a tag byte plus varint; UpperBound and
// CumulativeCountFloat take 9 bytes each; a non-nil Exemplar takes a tag
// byte, a varint length and its own Size(); unrecognized bytes are counted
// as-is.
func (m *Bucket) Size() (n int) {
        if m == nil {
                return 0
        }
        if m.CumulativeCount != 0 {
                n += 1 + sovMetrics(uint64(m.CumulativeCount))
        }
        if m.UpperBound != 0 {
                n += 9
        }
        if m.Exemplar != nil {
                sub := m.Exemplar.Size()
                n += 1 + sub + sovMetrics(uint64(sub))
        }
        if m.CumulativeCountFloat != 0 {
                n += 9
        }
        // len of a nil slice is 0, so no nil guard is needed.
        n += len(m.XXX_unrecognized)
        return n
}

// Size returns the number of bytes needed to encode m, or 0 for a nil
// receiver. Offset is sized as a tag byte plus a zigzag varint, Length as a
// tag byte plus a plain varint; unrecognized bytes are counted as-is.
func (m *BucketSpan) Size() (n int) {
        if m == nil {
                return 0
        }
        if m.Offset != 0 {
                n += 1 + sozMetrics(uint64(m.Offset))
        }
        if m.Length != 0 {
                n += 1 + sovMetrics(uint64(m.Length))
        }
        // len of a nil slice is 0, so no nil guard is needed.
        n += len(m.XXX_unrecognized)
        return n
}

// Size returns the number of bytes needed to encode m, or 0 for a nil
// receiver. Every Label entry and a non-nil Timestamp take a tag byte, a
// varint length and the nested Size(); a non-zero Value takes 9 bytes;
// unrecognized bytes are counted as-is.
func (m *Exemplar) Size() (n int) {
        if m == nil {
                return 0
        }
        for k := range m.Label {
                sub := m.Label[k].Size()
                n += 1 + sub + sovMetrics(uint64(sub))
        }
        if m.Value != 0 {
                n += 9
        }
        if m.Timestamp != nil {
                sub := m.Timestamp.Size()
                n += 1 + sub + sovMetrics(uint64(sub))
        }
        // len of a nil slice is 0, so no nil guard is needed.
        n += len(m.XXX_unrecognized)
        return n
}

// Size returns the number of bytes needed to encode m, or 0 for a nil
// receiver. Every Label entry and each non-nil Gauge, Counter, Summary,
// Untyped and Histogram takes a tag byte, a varint length and the nested
// Size(); a non-zero TimestampMs takes a tag byte plus its varint width;
// unrecognized bytes are counted as-is.
func (m *Metric) Size() (n int) {
        if m == nil {
                return 0
        }
        for k := range m.Label {
                sub := m.Label[k].Size()
                n += 1 + sub + sovMetrics(uint64(sub))
        }
        if m.Gauge != nil {
                sub := m.Gauge.Size()
                n += 1 + sub + sovMetrics(uint64(sub))
        }
        if m.Counter != nil {
                sub := m.Counter.Size()
                n += 1 + sub + sovMetrics(uint64(sub))
        }
        if m.Summary != nil {
                sub := m.Summary.Size()
                n += 1 + sub + sovMetrics(uint64(sub))
        }
        if m.Untyped != nil {
                sub := m.Untyped.Size()
                n += 1 + sub + sovMetrics(uint64(sub))
        }
        if m.TimestampMs != 0 {
                n += 1 + sovMetrics(uint64(m.TimestampMs))
        }
        if m.Histogram != nil {
                sub := m.Histogram.Size()
                n += 1 + sub + sovMetrics(uint64(sub))
        }
        // len of a nil slice is 0, so no nil guard is needed.
        n += len(m.XXX_unrecognized)
        return n
}

// Size returns the number of bytes needed to encode m, or 0 for a nil
// receiver. Each non-empty string (Name, Help, Unit) and every Metric entry
// takes a tag byte, a varint length and its payload; a non-zero Type takes
// a tag byte plus its varint width; unrecognized bytes are counted as-is.
func (m *MetricFamily) Size() (n int) {
        if m == nil {
                return 0
        }
        if nameLen := len(m.Name); nameLen > 0 {
                n += 1 + nameLen + sovMetrics(uint64(nameLen))
        }
        if helpLen := len(m.Help); helpLen > 0 {
                n += 1 + helpLen + sovMetrics(uint64(helpLen))
        }
        if m.Type != 0 {
                n += 1 + sovMetrics(uint64(m.Type))
        }
        for k := range m.Metric {
                sub := m.Metric[k].Size()
                n += 1 + sub + sovMetrics(uint64(sub))
        }
        if unitLen := len(m.Unit); unitLen > 0 {
                n += 1 + unitLen + sovMetrics(uint64(unitLen))
        }
        // len of a nil slice is 0, so no nil guard is needed.
        n += len(m.XXX_unrecognized)
        return n
}

// sovMetrics returns the number of bytes x occupies as a base-128 varint:
// its bit length (treating 0 as one bit wide via x|1) divided by 7,
// rounded up.
func sovMetrics(x uint64) (n int) {
        bitLen := math_bits.Len64(x | 1)
        return (bitLen + 6) / 7
}
// sozMetrics returns the varint width of x after zigzag encoding: x is
// shifted left by one and XORed with its sign (as a signed 64-bit value)
// smeared across all bits, and the result is measured with sovMetrics.
func sozMetrics(x uint64) (n int) {
        signMask := uint64(int64(x) >> 63)
        return sovMetrics((x << 1) ^ signMask)
}
// Unmarshal decodes the protobuf wire-format bytes in dAtA into m.
//
// Field 1 (Name) and field 2 (Value) are length-delimited strings. Any other
// field is skipped with skipMetrics and its raw bytes are appended to
// m.XXX_unrecognized. It returns io.ErrUnexpectedEOF when the input is
// truncated, ErrIntOverflowMetrics when a varint does not terminate within
// 64 bits, and ErrInvalidLengthMetrics when a decoded length is negative or
// overflows the index.
func (m *LabelPair) Unmarshal(dAtA []byte) error {
        end := len(dAtA)
        pos := 0
        for pos < end {
                fieldStart := pos

                // Decode the field key varint (field number and wire type).
                var key uint64
                shift := uint(0)
                for {
                        if shift >= 64 {
                                return ErrIntOverflowMetrics
                        }
                        if pos >= end {
                                return io.ErrUnexpectedEOF
                        }
                        octet := dAtA[pos]
                        pos++
                        key |= uint64(octet&0x7F) << shift
                        if octet < 0x80 {
                                break
                        }
                        shift += 7
                }
                fieldNum := int32(key >> 3)
                wireType := int(key & 0x7)
                if wireType == 4 {
                        return fmt.Errorf("proto: LabelPair: wiretype end group for non-group")
                }
                if fieldNum <= 0 {
                        return fmt.Errorf("proto: LabelPair: illegal tag %d (wire type %d)", fieldNum, key)
                }

                switch fieldNum {
                case 1:
                        if wireType != 2 {
                                return fmt.Errorf("proto: wrong wireType = %d for field Name", wireType)
                        }
                        // Decode the string's length prefix.
                        var rawLen uint64
                        shift = 0
                        for {
                                if shift >= 64 {
                                        return ErrIntOverflowMetrics
                                }
                                if pos >= end {
                                        return io.ErrUnexpectedEOF
                                }
                                octet := dAtA[pos]
                                pos++
                                rawLen |= uint64(octet&0x7F) << shift
                                if octet < 0x80 {
                                        break
                                }
                                shift += 7
                        }
                        strLen := int(rawLen)
                        strEnd := pos + strLen
                        // A negative value means the length or the end index overflowed int.
                        if strLen < 0 || strEnd < 0 {
                                return ErrInvalidLengthMetrics
                        }
                        if strEnd > end {
                                return io.ErrUnexpectedEOF
                        }
                        m.Name = string(dAtA[pos:strEnd])
                        pos = strEnd
                case 2:
                        if wireType != 2 {
                                return fmt.Errorf("proto: wrong wireType = %d for field Value", wireType)
                        }
                        // Decode the string's length prefix.
                        var rawLen uint64
                        shift = 0
                        for {
                                if shift >= 64 {
                                        return ErrIntOverflowMetrics
                                }
                                if pos >= end {
                                        return io.ErrUnexpectedEOF
                                }
                                octet := dAtA[pos]
                                pos++
                                rawLen |= uint64(octet&0x7F) << shift
                                if octet < 0x80 {
                                        break
                                }
                                shift += 7
                        }
                        strLen := int(rawLen)
                        strEnd := pos + strLen
                        // A negative value means the length or the end index overflowed int.
                        if strLen < 0 || strEnd < 0 {
                                return ErrInvalidLengthMetrics
                        }
                        if strEnd > end {
                                return io.ErrUnexpectedEOF
                        }
                        m.Value = string(dAtA[pos:strEnd])
                        pos = strEnd
                default:
                        // Unknown field: rewind to the key so the whole field
                        // (key included) is kept in XXX_unrecognized.
                        pos = fieldStart
                        skipped, err := skipMetrics(dAtA[pos:])
                        if err != nil {
                                return err
                        }
                        next := pos + skipped
                        if skipped < 0 || next < 0 {
                                return ErrInvalidLengthMetrics
                        }
                        if next > end {
                                return io.ErrUnexpectedEOF
                        }
                        m.XXX_unrecognized = append(m.XXX_unrecognized, dAtA[pos:next]...)
                        pos = next
                }
        }

        if pos > end {
                return io.ErrUnexpectedEOF
        }
        return nil
}
// Unmarshal decodes the protobuf wire-format bytes in dAtA into m.
//
// Field 1 (Value) is read as a little-endian fixed 64-bit float. Every other
// field is skipped with skipMetrics and its raw bytes are appended to
// m.XXX_unrecognized. It returns io.ErrUnexpectedEOF when the input is
// truncated, ErrIntOverflowMetrics when the key varint does not terminate
// within 64 bits, and ErrInvalidLengthMetrics when a skipped length is
// negative or overflows the index.
func (m *Gauge) Unmarshal(dAtA []byte) error {
        end := len(dAtA)
        pos := 0
        for pos < end {
                fieldStart := pos

                // Decode the field key varint (field number and wire type).
                var key uint64
                shift := uint(0)
                for {
                        if shift >= 64 {
                                return ErrIntOverflowMetrics
                        }
                        if pos >= end {
                                return io.ErrUnexpectedEOF
                        }
                        octet := dAtA[pos]
                        pos++
                        key |= uint64(octet&0x7F) << shift
                        if octet < 0x80 {
                                break
                        }
                        shift += 7
                }
                fieldNum := int32(key >> 3)
                wireType := int(key & 0x7)
                if wireType == 4 {
                        return fmt.Errorf("proto: Gauge: wiretype end group for non-group")
                }
                if fieldNum <= 0 {
                        return fmt.Errorf("proto: Gauge: illegal tag %d (wire type %d)", fieldNum, key)
                }

                if fieldNum == 1 {
                        if wireType != 1 {
                                return fmt.Errorf("proto: wrong wireType = %d for field Value", wireType)
                        }
                        if pos+8 > end {
                                return io.ErrUnexpectedEOF
                        }
                        raw := encoding_binary.LittleEndian.Uint64(dAtA[pos:])
                        pos += 8
                        m.Value = math.Float64frombits(raw)
                } else {
                        // Unknown field: rewind to the key so the whole field
                        // (key included) is kept in XXX_unrecognized.
                        pos = fieldStart
                        skipped, err := skipMetrics(dAtA[pos:])
                        if err != nil {
                                return err
                        }
                        next := pos + skipped
                        if skipped < 0 || next < 0 {
                                return ErrInvalidLengthMetrics
                        }
                        if next > end {
                                return io.ErrUnexpectedEOF
                        }
                        m.XXX_unrecognized = append(m.XXX_unrecognized, dAtA[pos:next]...)
                        pos = next
                }
        }

        if pos > end {
                return io.ErrUnexpectedEOF
        }
        return nil
}
// Unmarshal decodes the protobuf wire-format bytes in dAtA into m.
//
// Field 1 (Value) is a little-endian fixed 64-bit float. Field 2 (Exemplar)
// and field 3 (CreatedTimestamp) are length-delimited sub-messages; each is
// allocated on first use and decoded by its own Unmarshal method, whose error
// is returned unchanged. Every other field is skipped with skipMetrics and
// its raw bytes are appended to m.XXX_unrecognized.
//
// It returns io.ErrUnexpectedEOF when the input is truncated,
// ErrIntOverflowMetrics when a varint does not terminate within 64 bits, and
// ErrInvalidLengthMetrics when a decoded length is negative or overflows the
// index.
func (m *Counter) Unmarshal(dAtA []byte) error {
        end := len(dAtA)
        pos := 0
        for pos < end {
                fieldStart := pos

                // Decode the field key varint (field number and wire type).
                var key uint64
                shift := uint(0)
                for {
                        if shift >= 64 {
                                return ErrIntOverflowMetrics
                        }
                        if pos >= end {
                                return io.ErrUnexpectedEOF
                        }
                        octet := dAtA[pos]
                        pos++
                        key |= uint64(octet&0x7F) << shift
                        if octet < 0x80 {
                                break
                        }
                        shift += 7
                }
                fieldNum := int32(key >> 3)
                wireType := int(key & 0x7)
                if wireType == 4 {
                        return fmt.Errorf("proto: Counter: wiretype end group for non-group")
                }
                if fieldNum <= 0 {
                        return fmt.Errorf("proto: Counter: illegal tag %d (wire type %d)", fieldNum, key)
                }

                switch fieldNum {
                case 1:
                        if wireType != 1 {
                                return fmt.Errorf("proto: wrong wireType = %d for field Value", wireType)
                        }
                        if pos+8 > end {
                                return io.ErrUnexpectedEOF
                        }
                        raw := encoding_binary.LittleEndian.Uint64(dAtA[pos:])
                        pos += 8
                        m.Value = math.Float64frombits(raw)
                case 2:
                        if wireType != 2 {
                                return fmt.Errorf("proto: wrong wireType = %d for field Exemplar", wireType)
                        }
                        // Decode the sub-message's length prefix.
                        var msgLen int
                        shift = 0
                        for {
                                if shift >= 64 {
                                        return ErrIntOverflowMetrics
                                }
                                if pos >= end {
                                        return io.ErrUnexpectedEOF
                                }
                                octet := dAtA[pos]
                                pos++
                                msgLen |= int(octet&0x7F) << shift
                                if octet < 0x80 {
                                        break
                                }
                                shift += 7
                        }
                        msgEnd := pos + msgLen
                        // A negative value means the length or the end index overflowed int.
                        if msgLen < 0 || msgEnd < 0 {
                                return ErrInvalidLengthMetrics
                        }
                        if msgEnd > end {
                                return io.ErrUnexpectedEOF
                        }
                        if m.Exemplar == nil {
                                m.Exemplar = &Exemplar{}
                        }
                        if err := m.Exemplar.Unmarshal(dAtA[pos:msgEnd]); err != nil {
                                return err
                        }
                        pos = msgEnd
                case 3:
                        if wireType != 2 {
                                return fmt.Errorf("proto: wrong wireType = %d for field CreatedTimestamp", wireType)
                        }
                        // Decode the sub-message's length prefix.
                        var msgLen int
                        shift = 0
                        for {
                                if shift >= 64 {
                                        return ErrIntOverflowMetrics
                                }
                                if pos >= end {
                                        return io.ErrUnexpectedEOF
                                }
                                octet := dAtA[pos]
                                pos++
                                msgLen |= int(octet&0x7F) << shift
                                if octet < 0x80 {
                                        break
                                }
                                shift += 7
                        }
                        msgEnd := pos + msgLen
                        // A negative value means the length or the end index overflowed int.
                        if msgLen < 0 || msgEnd < 0 {
                                return ErrInvalidLengthMetrics
                        }
                        if msgEnd > end {
                                return io.ErrUnexpectedEOF
                        }
                        if m.CreatedTimestamp == nil {
                                m.CreatedTimestamp = &types.Timestamp{}
                        }
                        if err := m.CreatedTimestamp.Unmarshal(dAtA[pos:msgEnd]); err != nil {
                                return err
                        }
                        pos = msgEnd
                default:
                        // Unknown field: rewind to the key so the whole field
                        // (key included) is kept in XXX_unrecognized.
                        pos = fieldStart
                        skipped, err := skipMetrics(dAtA[pos:])
                        if err != nil {
                                return err
                        }
                        next := pos + skipped
                        if skipped < 0 || next < 0 {
                                return ErrInvalidLengthMetrics
                        }
                        if next > end {
                                return io.ErrUnexpectedEOF
                        }
                        m.XXX_unrecognized = append(m.XXX_unrecognized, dAtA[pos:next]...)
                        pos = next
                }
        }

        if pos > end {
                return io.ErrUnexpectedEOF
        }
        return nil
}
// Unmarshal decodes the protobuf wire-format bytes in dAtA into m.
//
// Field 1 (Quantile) and field 2 (Value) are each read as a little-endian
// fixed 64-bit float. Every other field is skipped with skipMetrics and its
// raw bytes are appended to m.XXX_unrecognized. It returns
// io.ErrUnexpectedEOF when the input is truncated, ErrIntOverflowMetrics when
// the key varint does not terminate within 64 bits, and
// ErrInvalidLengthMetrics when a skipped length is negative or overflows the
// index.
func (m *Quantile) Unmarshal(dAtA []byte) error {
        end := len(dAtA)
        pos := 0
        for pos < end {
                fieldStart := pos

                // Decode the field key varint (field number and wire type).
                var key uint64
                shift := uint(0)
                for {
                        if shift >= 64 {
                                return ErrIntOverflowMetrics
                        }
                        if pos >= end {
                                return io.ErrUnexpectedEOF
                        }
                        octet := dAtA[pos]
                        pos++
                        key |= uint64(octet&0x7F) << shift
                        if octet < 0x80 {
                                break
                        }
                        shift += 7
                }
                fieldNum := int32(key >> 3)
                wireType := int(key & 0x7)
                if wireType == 4 {
                        return fmt.Errorf("proto: Quantile: wiretype end group for non-group")
                }
                if fieldNum <= 0 {
                        return fmt.Errorf("proto: Quantile: illegal tag %d (wire type %d)", fieldNum, key)
                }

                switch fieldNum {
                case 1:
                        if wireType != 1 {
                                return fmt.Errorf("proto: wrong wireType = %d for field Quantile", wireType)
                        }
                        if pos+8 > end {
                                return io.ErrUnexpectedEOF
                        }
                        raw := encoding_binary.LittleEndian.Uint64(dAtA[pos:])
                        pos += 8
                        m.Quantile = math.Float64frombits(raw)
                case 2:
                        if wireType != 1 {
                                return fmt.Errorf("proto: wrong wireType = %d for field Value", wireType)
                        }
                        if pos+8 > end {
                                return io.ErrUnexpectedEOF
                        }
                        raw := encoding_binary.LittleEndian.Uint64(dAtA[pos:])
                        pos += 8
                        m.Value = math.Float64frombits(raw)
                default:
                        // Unknown field: rewind to the key so the whole field
                        // (key included) is kept in XXX_unrecognized.
                        pos = fieldStart
                        skipped, err := skipMetrics(dAtA[pos:])
                        if err != nil {
                                return err
                        }
                        next := pos + skipped
                        if skipped < 0 || next < 0 {
                                return ErrInvalidLengthMetrics
                        }
                        if next > end {
                                return io.ErrUnexpectedEOF
                        }
                        m.XXX_unrecognized = append(m.XXX_unrecognized, dAtA[pos:next]...)
                        pos = next
                }
        }

        if pos > end {
                return io.ErrUnexpectedEOF
        }
        return nil
}
// Unmarshal decodes the protobuf wire-format bytes in dAtA into m.
//
// Field 1 (SampleCount) is a varint that is reset and then accumulated
// directly into m.SampleCount. Field 2 (SampleSum) is a little-endian fixed
// 64-bit float. Field 3 (Quantile) is a repeated length-delimited
// sub-message: each occurrence appends a new element to m.Quantile and
// decodes into it. Field 4 (CreatedTimestamp) is a length-delimited
// sub-message allocated on first use. Sub-message decode errors are returned
// unchanged. Every other field is skipped with skipMetrics and its raw bytes
// are appended to m.XXX_unrecognized.
//
// It returns io.ErrUnexpectedEOF when the input is truncated,
// ErrIntOverflowMetrics when a varint does not terminate within 64 bits, and
// ErrInvalidLengthMetrics when a decoded length is negative or overflows the
// index.
func (m *Summary) Unmarshal(dAtA []byte) error {
        end := len(dAtA)
        pos := 0
        for pos < end {
                fieldStart := pos

                // Decode the field key varint (field number and wire type).
                var key uint64
                shift := uint(0)
                for {
                        if shift >= 64 {
                                return ErrIntOverflowMetrics
                        }
                        if pos >= end {
                                return io.ErrUnexpectedEOF
                        }
                        octet := dAtA[pos]
                        pos++
                        key |= uint64(octet&0x7F) << shift
                        if octet < 0x80 {
                                break
                        }
                        shift += 7
                }
                fieldNum := int32(key >> 3)
                wireType := int(key & 0x7)
                if wireType == 4 {
                        return fmt.Errorf("proto: Summary: wiretype end group for non-group")
                }
                if fieldNum <= 0 {
                        return fmt.Errorf("proto: Summary: illegal tag %d (wire type %d)", fieldNum, key)
                }

                switch fieldNum {
                case 1:
                        if wireType != 0 {
                                return fmt.Errorf("proto: wrong wireType = %d for field SampleCount", wireType)
                        }
                        // The field is written in place, so a failed decode
                        // leaves whatever bits were accumulated so far.
                        m.SampleCount = 0
                        shift = 0
                        for {
                                if shift >= 64 {
                                        return ErrIntOverflowMetrics
                                }
                                if pos >= end {
                                        return io.ErrUnexpectedEOF
                                }
                                octet := dAtA[pos]
                                pos++
                                m.SampleCount |= uint64(octet&0x7F) << shift
                                if octet < 0x80 {
                                        break
                                }
                                shift += 7
                        }
                case 2:
                        if wireType != 1 {
                                return fmt.Errorf("proto: wrong wireType = %d for field SampleSum", wireType)
                        }
                        if pos+8 > end {
                                return io.ErrUnexpectedEOF
                        }
                        raw := encoding_binary.LittleEndian.Uint64(dAtA[pos:])
                        pos += 8
                        m.SampleSum = math.Float64frombits(raw)
                case 3:
                        if wireType != 2 {
                                return fmt.Errorf("proto: wrong wireType = %d for field Quantile", wireType)
                        }
                        // Decode the sub-message's length prefix.
                        var msgLen int
                        shift = 0
                        for {
                                if shift >= 64 {
                                        return ErrIntOverflowMetrics
                                }
                                if pos >= end {
                                        return io.ErrUnexpectedEOF
                                }
                                octet := dAtA[pos]
                                pos++
                                msgLen |= int(octet&0x7F) << shift
                                if octet < 0x80 {
                                        break
                                }
                                shift += 7
                        }
                        msgEnd := pos + msgLen
                        // A negative value means the length or the end index overflowed int.
                        if msgLen < 0 || msgEnd < 0 {
                                return ErrInvalidLengthMetrics
                        }
                        if msgEnd > end {
                                return io.ErrUnexpectedEOF
                        }
                        // Append first, then decode into the new last element.
                        m.Quantile = append(m.Quantile, Quantile{})
                        last := len(m.Quantile) - 1
                        if err := m.Quantile[last].Unmarshal(dAtA[pos:msgEnd]); err != nil {
                                return err
                        }
                        pos = msgEnd
                case 4:
                        if wireType != 2 {
                                return fmt.Errorf("proto: wrong wireType = %d for field CreatedTimestamp", wireType)
                        }
                        // Decode the sub-message's length prefix.
                        var msgLen int
                        shift = 0
                        for {
                                if shift >= 64 {
                                        return ErrIntOverflowMetrics
                                }
                                if pos >= end {
                                        return io.ErrUnexpectedEOF
                                }
                                octet := dAtA[pos]
                                pos++
                                msgLen |= int(octet&0x7F) << shift
                                if octet < 0x80 {
                                        break
                                }
                                shift += 7
                        }
                        msgEnd := pos + msgLen
                        // A negative value means the length or the end index overflowed int.
                        if msgLen < 0 || msgEnd < 0 {
                                return ErrInvalidLengthMetrics
                        }
                        if msgEnd > end {
                                return io.ErrUnexpectedEOF
                        }
                        if m.CreatedTimestamp == nil {
                                m.CreatedTimestamp = &types.Timestamp{}
                        }
                        if err := m.CreatedTimestamp.Unmarshal(dAtA[pos:msgEnd]); err != nil {
                                return err
                        }
                        pos = msgEnd
                default:
                        // Unknown field: rewind to the key so the whole field
                        // (key included) is kept in XXX_unrecognized.
                        pos = fieldStart
                        skipped, err := skipMetrics(dAtA[pos:])
                        if err != nil {
                                return err
                        }
                        next := pos + skipped
                        if skipped < 0 || next < 0 {
                                return ErrInvalidLengthMetrics
                        }
                        if next > end {
                                return io.ErrUnexpectedEOF
                        }
                        m.XXX_unrecognized = append(m.XXX_unrecognized, dAtA[pos:next]...)
                        pos = next
                }
        }

        if pos > end {
                return io.ErrUnexpectedEOF
        }
        return nil
}
// Unmarshal decodes the protobuf wire-format bytes in dAtA into m.
//
// Field 1 (Value) is read as a little-endian fixed 64-bit float. Every other
// field is skipped with skipMetrics and its raw bytes are appended to
// m.XXX_unrecognized. It returns io.ErrUnexpectedEOF when the input is
// truncated, ErrIntOverflowMetrics when the key varint does not terminate
// within 64 bits, and ErrInvalidLengthMetrics when a skipped length is
// negative or overflows the index.
func (m *Untyped) Unmarshal(dAtA []byte) error {
        end := len(dAtA)
        pos := 0
        for pos < end {
                fieldStart := pos

                // Decode the field key varint (field number and wire type).
                var key uint64
                shift := uint(0)
                for {
                        if shift >= 64 {
                                return ErrIntOverflowMetrics
                        }
                        if pos >= end {
                                return io.ErrUnexpectedEOF
                        }
                        octet := dAtA[pos]
                        pos++
                        key |= uint64(octet&0x7F) << shift
                        if octet < 0x80 {
                                break
                        }
                        shift += 7
                }
                fieldNum := int32(key >> 3)
                wireType := int(key & 0x7)
                if wireType == 4 {
                        return fmt.Errorf("proto: Untyped: wiretype end group for non-group")
                }
                if fieldNum <= 0 {
                        return fmt.Errorf("proto: Untyped: illegal tag %d (wire type %d)", fieldNum, key)
                }

                if fieldNum == 1 {
                        if wireType != 1 {
                                return fmt.Errorf("proto: wrong wireType = %d for field Value", wireType)
                        }
                        if pos+8 > end {
                                return io.ErrUnexpectedEOF
                        }
                        raw := encoding_binary.LittleEndian.Uint64(dAtA[pos:])
                        pos += 8
                        m.Value = math.Float64frombits(raw)
                } else {
                        // Unknown field: rewind to the key so the whole field
                        // (key included) is kept in XXX_unrecognized.
                        pos = fieldStart
                        skipped, err := skipMetrics(dAtA[pos:])
                        if err != nil {
                                return err
                        }
                        next := pos + skipped
                        if skipped < 0 || next < 0 {
                                return ErrInvalidLengthMetrics
                        }
                        if next > end {
                                return io.ErrUnexpectedEOF
                        }
                        m.XXX_unrecognized = append(m.XXX_unrecognized, dAtA[pos:next]...)
                        pos = next
                }
        }

        if pos > end {
                return io.ErrUnexpectedEOF
        }
        return nil
}
// Unmarshal decodes the protobuf wire-format bytes in dAtA into m.
//
// The receiver is not reset first: scalar fields that appear in dAtA are
// overwritten, repeated fields (Bucket, NegativeSpan, NegativeDelta,
// NegativeCount, PositiveSpan, PositiveDelta, PositiveCount, Exemplars) are
// appended to whatever m already holds, and CreatedTimestamp is allocated on
// first use and then decoded into. Fields with an unknown field number are
// preserved verbatim in m.XXX_unrecognized.
//
// Field numbers and wire encodings handled here:
//
//	 1 SampleCount       varint
//	 2 SampleSum         fixed64 (float64 bits)
//	 3 Bucket            length-delimited message, repeated
//	 4 SampleCountFloat  fixed64 (float64 bits)
//	 5 Schema            zigzag varint (32 bit)
//	 6 ZeroThreshold     fixed64 (float64 bits)
//	 7 ZeroCount         varint
//	 8 ZeroCountFloat    fixed64 (float64 bits)
//	 9 NegativeSpan      length-delimited message, repeated
//	10 NegativeDelta     zigzag varint, repeated (packed or unpacked)
//	11 NegativeCount     fixed64, repeated (packed or unpacked)
//	12 PositiveSpan      length-delimited message, repeated
//	13 PositiveDelta     zigzag varint, repeated (packed or unpacked)
//	14 PositiveCount     fixed64, repeated (packed or unpacked)
//	15 CreatedTimestamp  length-delimited message
//	16 Exemplars         length-delimited message, repeated
//
// It returns ErrIntOverflowMetrics when a varint is longer than 64 bits,
// io.ErrUnexpectedEOF when the input (or a packed payload) ends in the middle
// of a value, ErrInvalidLengthMetrics when a length prefix is negative or
// overflows, a formatted error when a known field carries the wrong wire
// type, and otherwise whatever a nested Unmarshal or skipMetrics returns.
// On error m may be left partially populated.
//
// NOTE(review): the identifier style suggests this method is emitted by a
// protobuf code generator; if so, hand edits must also be applied upstream or
// they will be lost on regeneration - confirm.
func (m *Histogram) Unmarshal(dAtA []byte) error {
        l := len(dAtA)
        iNdEx := 0
        for iNdEx < l {
                // Remember where this field's tag starts so that unknown fields
                // can be copied out including their tag.
                preIndex := iNdEx
                var wire uint64
                for shift := uint(0); ; shift += 7 {
                        if shift >= 64 {
                                return ErrIntOverflowMetrics
                        }
                        if iNdEx >= l {
                                return io.ErrUnexpectedEOF
                        }
                        b := dAtA[iNdEx]
                        iNdEx++
                        wire |= uint64(b&0x7F) << shift
                        if b < 0x80 {
                                break
                        }
                }
                // The tag packs the field number in the upper bits and the wire
                // type in the low three bits.
                fieldNum := int32(wire >> 3)
                wireType := int(wire & 0x7)
                if wireType == 4 {
                        return fmt.Errorf("proto: Histogram: wiretype end group for non-group")
                }
                if fieldNum <= 0 {
                        // The second argument is the full tag value (wire), not just the
                        // 3-bit wire type; kept as is for consistency with the sibling
                        // Unmarshal methods in this file.
                        return fmt.Errorf("proto: Histogram: illegal tag %d (wire type %d)", fieldNum, wire)
                }
                switch fieldNum {
                case 1:
                        if wireType != 0 {
                                return fmt.Errorf("proto: wrong wireType = %d for field SampleCount", wireType)
                        }
                        m.SampleCount = 0
                        for shift := uint(0); ; shift += 7 {
                                if shift >= 64 {
                                        return ErrIntOverflowMetrics
                                }
                                if iNdEx >= l {
                                        return io.ErrUnexpectedEOF
                                }
                                b := dAtA[iNdEx]
                                iNdEx++
                                m.SampleCount |= uint64(b&0x7F) << shift
                                if b < 0x80 {
                                        break
                                }
                        }
                case 2:
                        if wireType != 1 {
                                return fmt.Errorf("proto: wrong wireType = %d for field SampleSum", wireType)
                        }
                        var v uint64
                        if (iNdEx + 8) > l {
                                return io.ErrUnexpectedEOF
                        }
                        v = uint64(encoding_binary.LittleEndian.Uint64(dAtA[iNdEx:]))
                        iNdEx += 8
                        m.SampleSum = float64(math.Float64frombits(v))
                case 3:
                        if wireType != 2 {
                                return fmt.Errorf("proto: wrong wireType = %d for field Bucket", wireType)
                        }
                        var msglen int
                        for shift := uint(0); ; shift += 7 {
                                if shift >= 64 {
                                        return ErrIntOverflowMetrics
                                }
                                if iNdEx >= l {
                                        return io.ErrUnexpectedEOF
                                }
                                b := dAtA[iNdEx]
                                iNdEx++
                                msglen |= int(b&0x7F) << shift
                                if b < 0x80 {
                                        break
                                }
                        }
                        if msglen < 0 {
                                return ErrInvalidLengthMetrics
                        }
                        postIndex := iNdEx + msglen
                        // A negative sum means iNdEx+msglen overflowed int.
                        if postIndex < 0 {
                                return ErrInvalidLengthMetrics
                        }
                        if postIndex > l {
                                return io.ErrUnexpectedEOF
                        }
                        m.Bucket = append(m.Bucket, Bucket{})
                        if err := m.Bucket[len(m.Bucket)-1].Unmarshal(dAtA[iNdEx:postIndex]); err != nil {
                                return err
                        }
                        iNdEx = postIndex
                case 4:
                        if wireType != 1 {
                                return fmt.Errorf("proto: wrong wireType = %d for field SampleCountFloat", wireType)
                        }
                        var v uint64
                        if (iNdEx + 8) > l {
                                return io.ErrUnexpectedEOF
                        }
                        v = uint64(encoding_binary.LittleEndian.Uint64(dAtA[iNdEx:]))
                        iNdEx += 8
                        m.SampleCountFloat = float64(math.Float64frombits(v))
                case 5:
                        if wireType != 0 {
                                return fmt.Errorf("proto: wrong wireType = %d for field Schema", wireType)
                        }
                        var v int32
                        for shift := uint(0); ; shift += 7 {
                                if shift >= 64 {
                                        return ErrIntOverflowMetrics
                                }
                                if iNdEx >= l {
                                        return io.ErrUnexpectedEOF
                                }
                                b := dAtA[iNdEx]
                                iNdEx++
                                v |= int32(b&0x7F) << shift
                                if b < 0x80 {
                                        break
                                }
                        }
                        // Zigzag decode: the low bit carries the sign, the remaining
                        // bits the magnitude.
                        v = int32((uint32(v) >> 1) ^ uint32(((v&1)<<31)>>31))
                        m.Schema = v
                case 6:
                        if wireType != 1 {
                                return fmt.Errorf("proto: wrong wireType = %d for field ZeroThreshold", wireType)
                        }
                        var v uint64
                        if (iNdEx + 8) > l {
                                return io.ErrUnexpectedEOF
                        }
                        v = uint64(encoding_binary.LittleEndian.Uint64(dAtA[iNdEx:]))
                        iNdEx += 8
                        m.ZeroThreshold = float64(math.Float64frombits(v))
                case 7:
                        if wireType != 0 {
                                return fmt.Errorf("proto: wrong wireType = %d for field ZeroCount", wireType)
                        }
                        m.ZeroCount = 0
                        for shift := uint(0); ; shift += 7 {
                                if shift >= 64 {
                                        return ErrIntOverflowMetrics
                                }
                                if iNdEx >= l {
                                        return io.ErrUnexpectedEOF
                                }
                                b := dAtA[iNdEx]
                                iNdEx++
                                m.ZeroCount |= uint64(b&0x7F) << shift
                                if b < 0x80 {
                                        break
                                }
                        }
                case 8:
                        if wireType != 1 {
                                return fmt.Errorf("proto: wrong wireType = %d for field ZeroCountFloat", wireType)
                        }
                        var v uint64
                        if (iNdEx + 8) > l {
                                return io.ErrUnexpectedEOF
                        }
                        v = uint64(encoding_binary.LittleEndian.Uint64(dAtA[iNdEx:]))
                        iNdEx += 8
                        m.ZeroCountFloat = float64(math.Float64frombits(v))
                case 9:
                        if wireType != 2 {
                                return fmt.Errorf("proto: wrong wireType = %d for field NegativeSpan", wireType)
                        }
                        var msglen int
                        for shift := uint(0); ; shift += 7 {
                                if shift >= 64 {
                                        return ErrIntOverflowMetrics
                                }
                                if iNdEx >= l {
                                        return io.ErrUnexpectedEOF
                                }
                                b := dAtA[iNdEx]
                                iNdEx++
                                msglen |= int(b&0x7F) << shift
                                if b < 0x80 {
                                        break
                                }
                        }
                        if msglen < 0 {
                                return ErrInvalidLengthMetrics
                        }
                        postIndex := iNdEx + msglen
                        if postIndex < 0 {
                                return ErrInvalidLengthMetrics
                        }
                        if postIndex > l {
                                return io.ErrUnexpectedEOF
                        }
                        m.NegativeSpan = append(m.NegativeSpan, BucketSpan{})
                        if err := m.NegativeSpan[len(m.NegativeSpan)-1].Unmarshal(dAtA[iNdEx:postIndex]); err != nil {
                                return err
                        }
                        iNdEx = postIndex
                case 10:
                        if wireType == 0 {
                                // Unpacked form: a single zigzag varint element.
                                var v uint64
                                for shift := uint(0); ; shift += 7 {
                                        if shift >= 64 {
                                                return ErrIntOverflowMetrics
                                        }
                                        if iNdEx >= l {
                                                return io.ErrUnexpectedEOF
                                        }
                                        b := dAtA[iNdEx]
                                        iNdEx++
                                        v |= uint64(b&0x7F) << shift
                                        if b < 0x80 {
                                                break
                                        }
                                }
                                v = (v >> 1) ^ uint64((int64(v&1)<<63)>>63)
                                m.NegativeDelta = append(m.NegativeDelta, int64(v))
                        } else if wireType == 2 {
                                // Packed form: a length prefix followed by back-to-back varints.
                                var packedLen int
                                for shift := uint(0); ; shift += 7 {
                                        if shift >= 64 {
                                                return ErrIntOverflowMetrics
                                        }
                                        if iNdEx >= l {
                                                return io.ErrUnexpectedEOF
                                        }
                                        b := dAtA[iNdEx]
                                        iNdEx++
                                        packedLen |= int(b&0x7F) << shift
                                        if b < 0x80 {
                                                break
                                        }
                                }
                                if packedLen < 0 {
                                        return ErrInvalidLengthMetrics
                                }
                                postIndex := iNdEx + packedLen
                                if postIndex < 0 {
                                        return ErrInvalidLengthMetrics
                                }
                                if postIndex > l {
                                        return io.ErrUnexpectedEOF
                                }
                                // Every varint ends with a byte whose high bit is clear, so
                                // counting those bytes gives the element count for pre-sizing.
                                var elementCount int
                                for _, integer := range dAtA[iNdEx:postIndex] {
                                        if integer < 128 {
                                                elementCount++
                                        }
                                }
                                if elementCount != 0 && len(m.NegativeDelta) == 0 {
                                        m.NegativeDelta = make([]int64, 0, elementCount)
                                }
                                for iNdEx < postIndex {
                                        var v uint64
                                        for shift := uint(0); ; shift += 7 {
                                                if shift >= 64 {
                                                        return ErrIntOverflowMetrics
                                                }
                                                // Bound by the packed payload, not the whole buffer, so a
                                                // truncated trailing varint cannot swallow the next field.
                                                if iNdEx >= postIndex {
                                                        return io.ErrUnexpectedEOF
                                                }
                                                b := dAtA[iNdEx]
                                                iNdEx++
                                                v |= uint64(b&0x7F) << shift
                                                if b < 0x80 {
                                                        break
                                                }
                                        }
                                        v = (v >> 1) ^ uint64((int64(v&1)<<63)>>63)
                                        m.NegativeDelta = append(m.NegativeDelta, int64(v))
                                }
                        } else {
                                return fmt.Errorf("proto: wrong wireType = %d for field NegativeDelta", wireType)
                        }
                case 11:
                        if wireType == 1 {
                                // Unpacked form: a single fixed64 element.
                                var v uint64
                                if (iNdEx + 8) > l {
                                        return io.ErrUnexpectedEOF
                                }
                                v = uint64(encoding_binary.LittleEndian.Uint64(dAtA[iNdEx:]))
                                iNdEx += 8
                                v2 := float64(math.Float64frombits(v))
                                m.NegativeCount = append(m.NegativeCount, v2)
                        } else if wireType == 2 {
                                // Packed form: a length prefix followed by 8-byte values.
                                var packedLen int
                                for shift := uint(0); ; shift += 7 {
                                        if shift >= 64 {
                                                return ErrIntOverflowMetrics
                                        }
                                        if iNdEx >= l {
                                                return io.ErrUnexpectedEOF
                                        }
                                        b := dAtA[iNdEx]
                                        iNdEx++
                                        packedLen |= int(b&0x7F) << shift
                                        if b < 0x80 {
                                                break
                                        }
                                }
                                if packedLen < 0 {
                                        return ErrInvalidLengthMetrics
                                }
                                postIndex := iNdEx + packedLen
                                if postIndex < 0 {
                                        return ErrInvalidLengthMetrics
                                }
                                if postIndex > l {
                                        return io.ErrUnexpectedEOF
                                }
                                elementCount := packedLen / 8
                                if elementCount != 0 && len(m.NegativeCount) == 0 {
                                        m.NegativeCount = make([]float64, 0, elementCount)
                                }
                                for iNdEx < postIndex {
                                        var v uint64
                                        // Bound by the packed payload, not the whole buffer: a payload
                                        // length that is not a multiple of 8 is rejected instead of
                                        // reading into the bytes of the following field.
                                        if (iNdEx + 8) > postIndex {
                                                return io.ErrUnexpectedEOF
                                        }
                                        v = uint64(encoding_binary.LittleEndian.Uint64(dAtA[iNdEx:]))
                                        iNdEx += 8
                                        v2 := float64(math.Float64frombits(v))
                                        m.NegativeCount = append(m.NegativeCount, v2)
                                }
                        } else {
                                return fmt.Errorf("proto: wrong wireType = %d for field NegativeCount", wireType)
                        }
                case 12:
                        if wireType != 2 {
                                return fmt.Errorf("proto: wrong wireType = %d for field PositiveSpan", wireType)
                        }
                        var msglen int
                        for shift := uint(0); ; shift += 7 {
                                if shift >= 64 {
                                        return ErrIntOverflowMetrics
                                }
                                if iNdEx >= l {
                                        return io.ErrUnexpectedEOF
                                }
                                b := dAtA[iNdEx]
                                iNdEx++
                                msglen |= int(b&0x7F) << shift
                                if b < 0x80 {
                                        break
                                }
                        }
                        if msglen < 0 {
                                return ErrInvalidLengthMetrics
                        }
                        postIndex := iNdEx + msglen
                        if postIndex < 0 {
                                return ErrInvalidLengthMetrics
                        }
                        if postIndex > l {
                                return io.ErrUnexpectedEOF
                        }
                        m.PositiveSpan = append(m.PositiveSpan, BucketSpan{})
                        if err := m.PositiveSpan[len(m.PositiveSpan)-1].Unmarshal(dAtA[iNdEx:postIndex]); err != nil {
                                return err
                        }
                        iNdEx = postIndex
                case 13:
                        if wireType == 0 {
                                // Unpacked form: a single zigzag varint element.
                                var v uint64
                                for shift := uint(0); ; shift += 7 {
                                        if shift >= 64 {
                                                return ErrIntOverflowMetrics
                                        }
                                        if iNdEx >= l {
                                                return io.ErrUnexpectedEOF
                                        }
                                        b := dAtA[iNdEx]
                                        iNdEx++
                                        v |= uint64(b&0x7F) << shift
                                        if b < 0x80 {
                                                break
                                        }
                                }
                                v = (v >> 1) ^ uint64((int64(v&1)<<63)>>63)
                                m.PositiveDelta = append(m.PositiveDelta, int64(v))
                        } else if wireType == 2 {
                                // Packed form: a length prefix followed by back-to-back varints.
                                var packedLen int
                                for shift := uint(0); ; shift += 7 {
                                        if shift >= 64 {
                                                return ErrIntOverflowMetrics
                                        }
                                        if iNdEx >= l {
                                                return io.ErrUnexpectedEOF
                                        }
                                        b := dAtA[iNdEx]
                                        iNdEx++
                                        packedLen |= int(b&0x7F) << shift
                                        if b < 0x80 {
                                                break
                                        }
                                }
                                if packedLen < 0 {
                                        return ErrInvalidLengthMetrics
                                }
                                postIndex := iNdEx + packedLen
                                if postIndex < 0 {
                                        return ErrInvalidLengthMetrics
                                }
                                if postIndex > l {
                                        return io.ErrUnexpectedEOF
                                }
                                // Count varint terminator bytes to pre-size the slice.
                                var elementCount int
                                for _, integer := range dAtA[iNdEx:postIndex] {
                                        if integer < 128 {
                                                elementCount++
                                        }
                                }
                                if elementCount != 0 && len(m.PositiveDelta) == 0 {
                                        m.PositiveDelta = make([]int64, 0, elementCount)
                                }
                                for iNdEx < postIndex {
                                        var v uint64
                                        for shift := uint(0); ; shift += 7 {
                                                if shift >= 64 {
                                                        return ErrIntOverflowMetrics
                                                }
                                                // Bound by the packed payload, not the whole buffer, so a
                                                // truncated trailing varint cannot swallow the next field.
                                                if iNdEx >= postIndex {
                                                        return io.ErrUnexpectedEOF
                                                }
                                                b := dAtA[iNdEx]
                                                iNdEx++
                                                v |= uint64(b&0x7F) << shift
                                                if b < 0x80 {
                                                        break
                                                }
                                        }
                                        v = (v >> 1) ^ uint64((int64(v&1)<<63)>>63)
                                        m.PositiveDelta = append(m.PositiveDelta, int64(v))
                                }
                        } else {
                                return fmt.Errorf("proto: wrong wireType = %d for field PositiveDelta", wireType)
                        }
                case 14:
                        if wireType == 1 {
                                // Unpacked form: a single fixed64 element.
                                var v uint64
                                if (iNdEx + 8) > l {
                                        return io.ErrUnexpectedEOF
                                }
                                v = uint64(encoding_binary.LittleEndian.Uint64(dAtA[iNdEx:]))
                                iNdEx += 8
                                v2 := float64(math.Float64frombits(v))
                                m.PositiveCount = append(m.PositiveCount, v2)
                        } else if wireType == 2 {
                                // Packed form: a length prefix followed by 8-byte values.
                                var packedLen int
                                for shift := uint(0); ; shift += 7 {
                                        if shift >= 64 {
                                                return ErrIntOverflowMetrics
                                        }
                                        if iNdEx >= l {
                                                return io.ErrUnexpectedEOF
                                        }
                                        b := dAtA[iNdEx]
                                        iNdEx++
                                        packedLen |= int(b&0x7F) << shift
                                        if b < 0x80 {
                                                break
                                        }
                                }
                                if packedLen < 0 {
                                        return ErrInvalidLengthMetrics
                                }
                                postIndex := iNdEx + packedLen
                                if postIndex < 0 {
                                        return ErrInvalidLengthMetrics
                                }
                                if postIndex > l {
                                        return io.ErrUnexpectedEOF
                                }
                                elementCount := packedLen / 8
                                if elementCount != 0 && len(m.PositiveCount) == 0 {
                                        m.PositiveCount = make([]float64, 0, elementCount)
                                }
                                for iNdEx < postIndex {
                                        var v uint64
                                        // Bound by the packed payload, not the whole buffer: a payload
                                        // length that is not a multiple of 8 is rejected instead of
                                        // reading into the bytes of the following field.
                                        if (iNdEx + 8) > postIndex {
                                                return io.ErrUnexpectedEOF
                                        }
                                        v = uint64(encoding_binary.LittleEndian.Uint64(dAtA[iNdEx:]))
                                        iNdEx += 8
                                        v2 := float64(math.Float64frombits(v))
                                        m.PositiveCount = append(m.PositiveCount, v2)
                                }
                        } else {
                                return fmt.Errorf("proto: wrong wireType = %d for field PositiveCount", wireType)
                        }
                case 15:
                        if wireType != 2 {
                                return fmt.Errorf("proto: wrong wireType = %d for field CreatedTimestamp", wireType)
                        }
                        var msglen int
                        for shift := uint(0); ; shift += 7 {
                                if shift >= 64 {
                                        return ErrIntOverflowMetrics
                                }
                                if iNdEx >= l {
                                        return io.ErrUnexpectedEOF
                                }
                                b := dAtA[iNdEx]
                                iNdEx++
                                msglen |= int(b&0x7F) << shift
                                if b < 0x80 {
                                        break
                                }
                        }
                        if msglen < 0 {
                                return ErrInvalidLengthMetrics
                        }
                        postIndex := iNdEx + msglen
                        if postIndex < 0 {
                                return ErrInvalidLengthMetrics
                        }
                        if postIndex > l {
                                return io.ErrUnexpectedEOF
                        }
                        // Allocate lazily; a repeated occurrence decodes into the same
                        // Timestamp rather than replacing it.
                        if m.CreatedTimestamp == nil {
                                m.CreatedTimestamp = &types.Timestamp{}
                        }
                        if err := m.CreatedTimestamp.Unmarshal(dAtA[iNdEx:postIndex]); err != nil {
                                return err
                        }
                        iNdEx = postIndex
                case 16:
                        if wireType != 2 {
                                return fmt.Errorf("proto: wrong wireType = %d for field Exemplars", wireType)
                        }
                        var msglen int
                        for shift := uint(0); ; shift += 7 {
                                if shift >= 64 {
                                        return ErrIntOverflowMetrics
                                }
                                if iNdEx >= l {
                                        return io.ErrUnexpectedEOF
                                }
                                b := dAtA[iNdEx]
                                iNdEx++
                                msglen |= int(b&0x7F) << shift
                                if b < 0x80 {
                                        break
                                }
                        }
                        if msglen < 0 {
                                return ErrInvalidLengthMetrics
                        }
                        postIndex := iNdEx + msglen
                        if postIndex < 0 {
                                return ErrInvalidLengthMetrics
                        }
                        if postIndex > l {
                                return io.ErrUnexpectedEOF
                        }
                        m.Exemplars = append(m.Exemplars, &Exemplar{})
                        if err := m.Exemplars[len(m.Exemplars)-1].Unmarshal(dAtA[iNdEx:postIndex]); err != nil {
                                return err
                        }
                        iNdEx = postIndex
                default:
                        // Unknown field: rewind to the tag, measure the whole field with
                        // skipMetrics and keep its raw bytes.
                        iNdEx = preIndex
                        skippy, err := skipMetrics(dAtA[iNdEx:])
                        if err != nil {
                                return err
                        }
                        if (skippy < 0) || (iNdEx+skippy) < 0 {
                                return ErrInvalidLengthMetrics
                        }
                        if (iNdEx + skippy) > l {
                                return io.ErrUnexpectedEOF
                        }
                        m.XXX_unrecognized = append(m.XXX_unrecognized, dAtA[iNdEx:iNdEx+skippy]...)
                        iNdEx += skippy
                }
        }

        if iNdEx > l {
                return io.ErrUnexpectedEOF
        }
        return nil
}
// Unmarshal decodes the protobuf wire-format bytes in dAtA into m.
//
// Fields are handled in the order they appear in dAtA:
//
//   - 1: CumulativeCount, varint
//   - 2: UpperBound, fixed 64-bit float
//   - 3: Exemplar, length-delimited message, decoded into the existing
//     m.Exemplar when one is set and into a fresh Exemplar otherwise
//   - 4: CumulativeCountFloat, fixed 64-bit float
//
// Any other field number is measured with skipMetrics and its raw bytes are
// appended to m.XXX_unrecognized. m is not cleared beforehand, and whatever
// was written before an error is returned stays in place.
//
// Truncated input yields io.ErrUnexpectedEOF, a varint that is still
// unterminated after ten bytes yields ErrIntOverflowMetrics, a negative or
// overflowing length yields ErrInvalidLengthMetrics, and an illegal tag or
// unexpected wire type yields a formatted error.
func (m *Bucket) Unmarshal(dAtA []byte) error {
	end := len(dAtA)
	pos := 0
	for pos < end {
		fieldStart := pos

		// Read the field key: a varint holding field number and wire type.
		var key uint64
		for shift := uint(0); ; shift += 7 {
			if shift >= 64 {
				return ErrIntOverflowMetrics
			}
			if pos >= end {
				return io.ErrUnexpectedEOF
			}
			c := dAtA[pos]
			pos++
			key |= uint64(c&0x7F) << shift
			if c&0x80 == 0 {
				break
			}
		}
		fieldNum := int32(key >> 3)
		wireType := int(key & 0x7)
		if wireType == 4 {
			return fmt.Errorf("proto: Bucket: wiretype end group for non-group")
		}
		if fieldNum <= 0 {
			return fmt.Errorf("proto: Bucket: illegal tag %d (wire type %d)", fieldNum, key)
		}

		switch fieldNum {
		case 1:
			if wireType != 0 {
				return fmt.Errorf("proto: wrong wireType = %d for field CumulativeCount", wireType)
			}
			// Accumulate straight into the field, exactly as the wire varint dictates.
			m.CumulativeCount = 0
			for shift := uint(0); ; shift += 7 {
				if shift >= 64 {
					return ErrIntOverflowMetrics
				}
				if pos >= end {
					return io.ErrUnexpectedEOF
				}
				c := dAtA[pos]
				pos++
				m.CumulativeCount |= uint64(c&0x7F) << shift
				if c&0x80 == 0 {
					break
				}
			}
		case 2:
			if wireType != 1 {
				return fmt.Errorf("proto: wrong wireType = %d for field UpperBound", wireType)
			}
			if (pos + 8) > end {
				return io.ErrUnexpectedEOF
			}
			bits := encoding_binary.LittleEndian.Uint64(dAtA[pos:])
			pos += 8
			m.UpperBound = math.Float64frombits(bits)
		case 3:
			if wireType != 2 {
				return fmt.Errorf("proto: wrong wireType = %d for field Exemplar", wireType)
			}
			// Length prefix of the embedded message.
			var size int
			for shift := uint(0); ; shift += 7 {
				if shift >= 64 {
					return ErrIntOverflowMetrics
				}
				if pos >= end {
					return io.ErrUnexpectedEOF
				}
				c := dAtA[pos]
				pos++
				size |= int(c&0x7F) << shift
				if c&0x80 == 0 {
					break
				}
			}
			stop := pos + size
			if size < 0 || stop < 0 {
				return ErrInvalidLengthMetrics
			}
			if stop > end {
				return io.ErrUnexpectedEOF
			}
			if m.Exemplar == nil {
				m.Exemplar = &Exemplar{}
			}
			if err := m.Exemplar.Unmarshal(dAtA[pos:stop]); err != nil {
				return err
			}
			pos = stop
		case 4:
			if wireType != 1 {
				return fmt.Errorf("proto: wrong wireType = %d for field CumulativeCountFloat", wireType)
			}
			if (pos + 8) > end {
				return io.ErrUnexpectedEOF
			}
			bits := encoding_binary.LittleEndian.Uint64(dAtA[pos:])
			pos += 8
			m.CumulativeCountFloat = math.Float64frombits(bits)
		default:
			// Unknown field: rewind to the key so the raw bytes kept include it.
			pos = fieldStart
			skipped, err := skipMetrics(dAtA[pos:])
			if err != nil {
				return err
			}
			if skipped < 0 || pos+skipped < 0 {
				return ErrInvalidLengthMetrics
			}
			next := pos + skipped
			if next > end {
				return io.ErrUnexpectedEOF
			}
			m.XXX_unrecognized = append(m.XXX_unrecognized, dAtA[pos:next]...)
			pos = next
		}
	}

	if pos > end {
		return io.ErrUnexpectedEOF
	}
	return nil
}
// Unmarshal decodes the protobuf wire-format bytes in dAtA into m.
//
// Fields are handled in the order they appear in dAtA:
//
//   - 1: Offset, varint that is zigzag-decoded into an int32
//   - 2: Length, varint stored as uint32
//
// Any other field number is measured with skipMetrics and its raw bytes are
// appended to m.XXX_unrecognized. m is not cleared beforehand, and whatever
// was written before an error is returned stays in place.
//
// Truncated input yields io.ErrUnexpectedEOF, a varint that is still
// unterminated after ten bytes yields ErrIntOverflowMetrics, an invalid skip
// length yields ErrInvalidLengthMetrics, and an illegal tag or unexpected
// wire type yields a formatted error.
func (m *BucketSpan) Unmarshal(dAtA []byte) error {
	end := len(dAtA)
	pos := 0
	for pos < end {
		fieldStart := pos

		// Read the field key: a varint holding field number and wire type.
		var key uint64
		for shift := uint(0); ; shift += 7 {
			if shift >= 64 {
				return ErrIntOverflowMetrics
			}
			if pos >= end {
				return io.ErrUnexpectedEOF
			}
			c := dAtA[pos]
			pos++
			key |= uint64(c&0x7F) << shift
			if c&0x80 == 0 {
				break
			}
		}
		fieldNum := int32(key >> 3)
		wireType := int(key & 0x7)
		if wireType == 4 {
			return fmt.Errorf("proto: BucketSpan: wiretype end group for non-group")
		}
		if fieldNum <= 0 {
			return fmt.Errorf("proto: BucketSpan: illegal tag %d (wire type %d)", fieldNum, key)
		}

		switch fieldNum {
		case 1:
			if wireType != 0 {
				return fmt.Errorf("proto: wrong wireType = %d for field Offset", wireType)
			}
			var raw int32
			for shift := uint(0); ; shift += 7 {
				if shift >= 64 {
					return ErrIntOverflowMetrics
				}
				if pos >= end {
					return io.ErrUnexpectedEOF
				}
				c := dAtA[pos]
				pos++
				raw |= int32(c&0x7F) << shift
				if c&0x80 == 0 {
					break
				}
			}
			// Undo zigzag encoding: the low bit carries the sign.
			raw = int32((uint32(raw) >> 1) ^ uint32(((raw&1)<<31)>>31))
			m.Offset = raw
		case 2:
			if wireType != 0 {
				return fmt.Errorf("proto: wrong wireType = %d for field Length", wireType)
			}
			// Accumulate straight into the field.
			m.Length = 0
			for shift := uint(0); ; shift += 7 {
				if shift >= 64 {
					return ErrIntOverflowMetrics
				}
				if pos >= end {
					return io.ErrUnexpectedEOF
				}
				c := dAtA[pos]
				pos++
				m.Length |= uint32(c&0x7F) << shift
				if c&0x80 == 0 {
					break
				}
			}
		default:
			// Unknown field: rewind to the key so the raw bytes kept include it.
			pos = fieldStart
			skipped, err := skipMetrics(dAtA[pos:])
			if err != nil {
				return err
			}
			if skipped < 0 || pos+skipped < 0 {
				return ErrInvalidLengthMetrics
			}
			next := pos + skipped
			if next > end {
				return io.ErrUnexpectedEOF
			}
			m.XXX_unrecognized = append(m.XXX_unrecognized, dAtA[pos:next]...)
			pos = next
		}
	}

	if pos > end {
		return io.ErrUnexpectedEOF
	}
	return nil
}
// Unmarshal decodes the protobuf wire-format bytes in dAtA into m.
//
// Fields are handled in the order they appear in dAtA:
//
//   - 1: Label, length-delimited message; each occurrence appends a new
//     LabelPair to m.Label and decodes into it
//   - 2: Value, fixed 64-bit float
//   - 3: Timestamp, length-delimited message, decoded into the existing
//     m.Timestamp when one is set and into a fresh types.Timestamp otherwise
//
// Any other field number is measured with skipMetrics and its raw bytes are
// appended to m.XXX_unrecognized. m is not cleared beforehand, and whatever
// was written before an error is returned (including a just-appended label)
// stays in place.
//
// Truncated input yields io.ErrUnexpectedEOF, a varint that is still
// unterminated after ten bytes yields ErrIntOverflowMetrics, a negative or
// overflowing length yields ErrInvalidLengthMetrics, and an illegal tag or
// unexpected wire type yields a formatted error.
func (m *Exemplar) Unmarshal(dAtA []byte) error {
	end := len(dAtA)
	pos := 0
	for pos < end {
		fieldStart := pos

		// Read the field key: a varint holding field number and wire type.
		var key uint64
		for shift := uint(0); ; shift += 7 {
			if shift >= 64 {
				return ErrIntOverflowMetrics
			}
			if pos >= end {
				return io.ErrUnexpectedEOF
			}
			c := dAtA[pos]
			pos++
			key |= uint64(c&0x7F) << shift
			if c&0x80 == 0 {
				break
			}
		}
		fieldNum := int32(key >> 3)
		wireType := int(key & 0x7)
		if wireType == 4 {
			return fmt.Errorf("proto: Exemplar: wiretype end group for non-group")
		}
		if fieldNum <= 0 {
			return fmt.Errorf("proto: Exemplar: illegal tag %d (wire type %d)", fieldNum, key)
		}

		switch fieldNum {
		case 1:
			if wireType != 2 {
				return fmt.Errorf("proto: wrong wireType = %d for field Label", wireType)
			}
			// Length prefix of the embedded message.
			var size int
			for shift := uint(0); ; shift += 7 {
				if shift >= 64 {
					return ErrIntOverflowMetrics
				}
				if pos >= end {
					return io.ErrUnexpectedEOF
				}
				c := dAtA[pos]
				pos++
				size |= int(c&0x7F) << shift
				if c&0x80 == 0 {
					break
				}
			}
			stop := pos + size
			if size < 0 || stop < 0 {
				return ErrInvalidLengthMetrics
			}
			if stop > end {
				return io.ErrUnexpectedEOF
			}
			// Grow the slice first, then decode in place into the new slot.
			m.Label = append(m.Label, LabelPair{})
			last := len(m.Label) - 1
			if err := m.Label[last].Unmarshal(dAtA[pos:stop]); err != nil {
				return err
			}
			pos = stop
		case 2:
			if wireType != 1 {
				return fmt.Errorf("proto: wrong wireType = %d for field Value", wireType)
			}
			if (pos + 8) > end {
				return io.ErrUnexpectedEOF
			}
			bits := encoding_binary.LittleEndian.Uint64(dAtA[pos:])
			pos += 8
			m.Value = math.Float64frombits(bits)
		case 3:
			if wireType != 2 {
				return fmt.Errorf("proto: wrong wireType = %d for field Timestamp", wireType)
			}
			// Length prefix of the embedded message.
			var size int
			for shift := uint(0); ; shift += 7 {
				if shift >= 64 {
					return ErrIntOverflowMetrics
				}
				if pos >= end {
					return io.ErrUnexpectedEOF
				}
				c := dAtA[pos]
				pos++
				size |= int(c&0x7F) << shift
				if c&0x80 == 0 {
					break
				}
			}
			stop := pos + size
			if size < 0 || stop < 0 {
				return ErrInvalidLengthMetrics
			}
			if stop > end {
				return io.ErrUnexpectedEOF
			}
			if m.Timestamp == nil {
				m.Timestamp = &types.Timestamp{}
			}
			if err := m.Timestamp.Unmarshal(dAtA[pos:stop]); err != nil {
				return err
			}
			pos = stop
		default:
			// Unknown field: rewind to the key so the raw bytes kept include it.
			pos = fieldStart
			skipped, err := skipMetrics(dAtA[pos:])
			if err != nil {
				return err
			}
			if skipped < 0 || pos+skipped < 0 {
				return ErrInvalidLengthMetrics
			}
			next := pos + skipped
			if next > end {
				return io.ErrUnexpectedEOF
			}
			m.XXX_unrecognized = append(m.XXX_unrecognized, dAtA[pos:next]...)
			pos = next
		}
	}

	if pos > end {
		return io.ErrUnexpectedEOF
	}
	return nil
}
// Unmarshal decodes the protobuf wire-format bytes in dAtA into m.
//
// Fields are handled in the order they appear in dAtA:
//
//   - 1: Label, length-delimited message; each occurrence appends a new
//     LabelPair to m.Label and decodes into it
//   - 2: Gauge, 3: Counter, 4: Summary, 5: Untyped, 7: Histogram —
//     length-delimited messages, each decoded into the existing value when
//     one is set and into a freshly allocated one otherwise
//   - 6: TimestampMs, varint stored as int64
//
// Any other field number is measured with skipMetrics and its raw bytes are
// appended to m.XXX_unrecognized. m is not cleared beforehand, and whatever
// was written before an error is returned (including a just-appended label)
// stays in place.
//
// Truncated input yields io.ErrUnexpectedEOF, a varint that is still
// unterminated after ten bytes yields ErrIntOverflowMetrics, a negative or
// overflowing length yields ErrInvalidLengthMetrics, and an illegal tag or
// unexpected wire type yields a formatted error.
func (m *Metric) Unmarshal(dAtA []byte) error {
	end := len(dAtA)
	pos := 0
	for pos < end {
		fieldStart := pos

		// Read the field key: a varint holding field number and wire type.
		var key uint64
		for shift := uint(0); ; shift += 7 {
			if shift >= 64 {
				return ErrIntOverflowMetrics
			}
			if pos >= end {
				return io.ErrUnexpectedEOF
			}
			c := dAtA[pos]
			pos++
			key |= uint64(c&0x7F) << shift
			if c&0x80 == 0 {
				break
			}
		}
		fieldNum := int32(key >> 3)
		wireType := int(key & 0x7)
		if wireType == 4 {
			return fmt.Errorf("proto: Metric: wiretype end group for non-group")
		}
		if fieldNum <= 0 {
			return fmt.Errorf("proto: Metric: illegal tag %d (wire type %d)", fieldNum, key)
		}

		switch fieldNum {
		case 1:
			if wireType != 2 {
				return fmt.Errorf("proto: wrong wireType = %d for field Label", wireType)
			}
			// Length prefix of the embedded message.
			var size int
			for shift := uint(0); ; shift += 7 {
				if shift >= 64 {
					return ErrIntOverflowMetrics
				}
				if pos >= end {
					return io.ErrUnexpectedEOF
				}
				c := dAtA[pos]
				pos++
				size |= int(c&0x7F) << shift
				if c&0x80 == 0 {
					break
				}
			}
			stop := pos + size
			if size < 0 || stop < 0 {
				return ErrInvalidLengthMetrics
			}
			if stop > end {
				return io.ErrUnexpectedEOF
			}
			// Grow the slice first, then decode in place into the new slot.
			m.Label = append(m.Label, LabelPair{})
			last := len(m.Label) - 1
			if err := m.Label[last].Unmarshal(dAtA[pos:stop]); err != nil {
				return err
			}
			pos = stop
		case 2:
			if wireType != 2 {
				return fmt.Errorf("proto: wrong wireType = %d for field Gauge", wireType)
			}
			var size int
			for shift := uint(0); ; shift += 7 {
				if shift >= 64 {
					return ErrIntOverflowMetrics
				}
				if pos >= end {
					return io.ErrUnexpectedEOF
				}
				c := dAtA[pos]
				pos++
				size |= int(c&0x7F) << shift
				if c&0x80 == 0 {
					break
				}
			}
			stop := pos + size
			if size < 0 || stop < 0 {
				return ErrInvalidLengthMetrics
			}
			if stop > end {
				return io.ErrUnexpectedEOF
			}
			if m.Gauge == nil {
				m.Gauge = &Gauge{}
			}
			if err := m.Gauge.Unmarshal(dAtA[pos:stop]); err != nil {
				return err
			}
			pos = stop
		case 3:
			if wireType != 2 {
				return fmt.Errorf("proto: wrong wireType = %d for field Counter", wireType)
			}
			var size int
			for shift := uint(0); ; shift += 7 {
				if shift >= 64 {
					return ErrIntOverflowMetrics
				}
				if pos >= end {
					return io.ErrUnexpectedEOF
				}
				c := dAtA[pos]
				pos++
				size |= int(c&0x7F) << shift
				if c&0x80 == 0 {
					break
				}
			}
			stop := pos + size
			if size < 0 || stop < 0 {
				return ErrInvalidLengthMetrics
			}
			if stop > end {
				return io.ErrUnexpectedEOF
			}
			if m.Counter == nil {
				m.Counter = &Counter{}
			}
			if err := m.Counter.Unmarshal(dAtA[pos:stop]); err != nil {
				return err
			}
			pos = stop
		case 4:
			if wireType != 2 {
				return fmt.Errorf("proto: wrong wireType = %d for field Summary", wireType)
			}
			var size int
			for shift := uint(0); ; shift += 7 {
				if shift >= 64 {
					return ErrIntOverflowMetrics
				}
				if pos >= end {
					return io.ErrUnexpectedEOF
				}
				c := dAtA[pos]
				pos++
				size |= int(c&0x7F) << shift
				if c&0x80 == 0 {
					break
				}
			}
			stop := pos + size
			if size < 0 || stop < 0 {
				return ErrInvalidLengthMetrics
			}
			if stop > end {
				return io.ErrUnexpectedEOF
			}
			if m.Summary == nil {
				m.Summary = &Summary{}
			}
			if err := m.Summary.Unmarshal(dAtA[pos:stop]); err != nil {
				return err
			}
			pos = stop
		case 5:
			if wireType != 2 {
				return fmt.Errorf("proto: wrong wireType = %d for field Untyped", wireType)
			}
			var size int
			for shift := uint(0); ; shift += 7 {
				if shift >= 64 {
					return ErrIntOverflowMetrics
				}
				if pos >= end {
					return io.ErrUnexpectedEOF
				}
				c := dAtA[pos]
				pos++
				size |= int(c&0x7F) << shift
				if c&0x80 == 0 {
					break
				}
			}
			stop := pos + size
			if size < 0 || stop < 0 {
				return ErrInvalidLengthMetrics
			}
			if stop > end {
				return io.ErrUnexpectedEOF
			}
			if m.Untyped == nil {
				m.Untyped = &Untyped{}
			}
			if err := m.Untyped.Unmarshal(dAtA[pos:stop]); err != nil {
				return err
			}
			pos = stop
		case 6:
			if wireType != 0 {
				return fmt.Errorf("proto: wrong wireType = %d for field TimestampMs", wireType)
			}
			// Accumulate straight into the field.
			m.TimestampMs = 0
			for shift := uint(0); ; shift += 7 {
				if shift >= 64 {
					return ErrIntOverflowMetrics
				}
				if pos >= end {
					return io.ErrUnexpectedEOF
				}
				c := dAtA[pos]
				pos++
				m.TimestampMs |= int64(c&0x7F) << shift
				if c&0x80 == 0 {
					break
				}
			}
		case 7:
			if wireType != 2 {
				return fmt.Errorf("proto: wrong wireType = %d for field Histogram", wireType)
			}
			var size int
			for shift := uint(0); ; shift += 7 {
				if shift >= 64 {
					return ErrIntOverflowMetrics
				}
				if pos >= end {
					return io.ErrUnexpectedEOF
				}
				c := dAtA[pos]
				pos++
				size |= int(c&0x7F) << shift
				if c&0x80 == 0 {
					break
				}
			}
			stop := pos + size
			if size < 0 || stop < 0 {
				return ErrInvalidLengthMetrics
			}
			if stop > end {
				return io.ErrUnexpectedEOF
			}
			if m.Histogram == nil {
				m.Histogram = &Histogram{}
			}
			if err := m.Histogram.Unmarshal(dAtA[pos:stop]); err != nil {
				return err
			}
			pos = stop
		default:
			// Unknown field: rewind to the key so the raw bytes kept include it.
			pos = fieldStart
			skipped, err := skipMetrics(dAtA[pos:])
			if err != nil {
				return err
			}
			if skipped < 0 || pos+skipped < 0 {
				return ErrInvalidLengthMetrics
			}
			next := pos + skipped
			if next > end {
				return io.ErrUnexpectedEOF
			}
			m.XXX_unrecognized = append(m.XXX_unrecognized, dAtA[pos:next]...)
			pos = next
		}
	}

	if pos > end {
		return io.ErrUnexpectedEOF
	}
	return nil
}
// Unmarshal decodes the protobuf wire-format bytes in dAtA into m.
//
// Known fields are Name (1, string), Help (2, string), Type (3, varint
// MetricType), Metric (4, embedded message, appended on every occurrence) and
// Unit (5, string). Any other field is skipped with skipMetrics and its raw
// bytes are appended to m.XXX_unrecognized.
//
// m is not reset first: scalar fields are overwritten when present in dAtA and
// Metric entries are appended to whatever m already holds.
//
// It returns ErrIntOverflowMetrics when a varint runs past 64 bits,
// io.ErrUnexpectedEOF when dAtA ends in the middle of a value,
// ErrInvalidLengthMetrics when a length is negative or an index overflows, and
// a formatted error for illegal tags or a wire type that does not match the
// field.
func (m *MetricFamily) Unmarshal(dAtA []byte) error {
        l := len(dAtA)
        iNdEx := 0
        for iNdEx < l {
                // Remember where this field starts so unknown fields can be re-read
                // from their tag by skipMetrics.
                preIndex := iNdEx
                // Decode the field tag varint: (field number << 3) | wire type.
                var wire uint64
                for shift := uint(0); ; shift += 7 {
                        if shift >= 64 {
                                return ErrIntOverflowMetrics
                        }
                        if iNdEx >= l {
                                return io.ErrUnexpectedEOF
                        }
                        b := dAtA[iNdEx]
                        iNdEx++
                        wire |= uint64(b&0x7F) << shift
                        if b < 0x80 {
                                break
                        }
                }
                fieldNum := int32(wire >> 3)
                wireType := int(wire & 0x7)
                if wireType == 4 {
                        return fmt.Errorf("proto: MetricFamily: wiretype end group for non-group")
                }
                if fieldNum <= 0 {
                        // NOTE(review): the second value printed is the whole tag (wire),
                        // not the extracted wireType.
                        return fmt.Errorf("proto: MetricFamily: illegal tag %d (wire type %d)", fieldNum, wire)
                }
                switch fieldNum {
                case 1:
                        // Field 1: Name, length-delimited string.
                        if wireType != 2 {
                                return fmt.Errorf("proto: wrong wireType = %d for field Name", wireType)
                        }
                        var stringLen uint64
                        for shift := uint(0); ; shift += 7 {
                                if shift >= 64 {
                                        return ErrIntOverflowMetrics
                                }
                                if iNdEx >= l {
                                        return io.ErrUnexpectedEOF
                                }
                                b := dAtA[iNdEx]
                                iNdEx++
                                stringLen |= uint64(b&0x7F) << shift
                                if b < 0x80 {
                                        break
                                }
                        }
                        intStringLen := int(stringLen)
                        if intStringLen < 0 {
                                return ErrInvalidLengthMetrics
                        }
                        postIndex := iNdEx + intStringLen
                        // A negative sum means the addition overflowed int.
                        if postIndex < 0 {
                                return ErrInvalidLengthMetrics
                        }
                        if postIndex > l {
                                return io.ErrUnexpectedEOF
                        }
                        m.Name = string(dAtA[iNdEx:postIndex])
                        iNdEx = postIndex
                case 2:
                        // Field 2: Help, length-delimited string.
                        if wireType != 2 {
                                return fmt.Errorf("proto: wrong wireType = %d for field Help", wireType)
                        }
                        var stringLen uint64
                        for shift := uint(0); ; shift += 7 {
                                if shift >= 64 {
                                        return ErrIntOverflowMetrics
                                }
                                if iNdEx >= l {
                                        return io.ErrUnexpectedEOF
                                }
                                b := dAtA[iNdEx]
                                iNdEx++
                                stringLen |= uint64(b&0x7F) << shift
                                if b < 0x80 {
                                        break
                                }
                        }
                        intStringLen := int(stringLen)
                        if intStringLen < 0 {
                                return ErrInvalidLengthMetrics
                        }
                        postIndex := iNdEx + intStringLen
                        if postIndex < 0 {
                                return ErrInvalidLengthMetrics
                        }
                        if postIndex > l {
                                return io.ErrUnexpectedEOF
                        }
                        m.Help = string(dAtA[iNdEx:postIndex])
                        iNdEx = postIndex
                case 3:
                        // Field 3: Type, varint decoded directly into a MetricType.
                        if wireType != 0 {
                                return fmt.Errorf("proto: wrong wireType = %d for field Type", wireType)
                        }
                        m.Type = 0
                        for shift := uint(0); ; shift += 7 {
                                if shift >= 64 {
                                        return ErrIntOverflowMetrics
                                }
                                if iNdEx >= l {
                                        return io.ErrUnexpectedEOF
                                }
                                b := dAtA[iNdEx]
                                iNdEx++
                                m.Type |= MetricType(b&0x7F) << shift
                                if b < 0x80 {
                                        break
                                }
                        }
                case 4:
                        // Field 4: Metric, length-delimited embedded message; every
                        // occurrence adds one element to m.Metric.
                        if wireType != 2 {
                                return fmt.Errorf("proto: wrong wireType = %d for field Metric", wireType)
                        }
                        var msglen int
                        for shift := uint(0); ; shift += 7 {
                                if shift >= 64 {
                                        return ErrIntOverflowMetrics
                                }
                                if iNdEx >= l {
                                        return io.ErrUnexpectedEOF
                                }
                                b := dAtA[iNdEx]
                                iNdEx++
                                msglen |= int(b&0x7F) << shift
                                if b < 0x80 {
                                        break
                                }
                        }
                        if msglen < 0 {
                                return ErrInvalidLengthMetrics
                        }
                        postIndex := iNdEx + msglen
                        if postIndex < 0 {
                                return ErrInvalidLengthMetrics
                        }
                        if postIndex > l {
                                return io.ErrUnexpectedEOF
                        }
                        // Append a zero Metric first, then decode into it in place.
                        m.Metric = append(m.Metric, Metric{})
                        if err := m.Metric[len(m.Metric)-1].Unmarshal(dAtA[iNdEx:postIndex]); err != nil {
                                return err
                        }
                        iNdEx = postIndex
                case 5:
                        // Field 5: Unit, length-delimited string.
                        if wireType != 2 {
                                return fmt.Errorf("proto: wrong wireType = %d for field Unit", wireType)
                        }
                        var stringLen uint64
                        for shift := uint(0); ; shift += 7 {
                                if shift >= 64 {
                                        return ErrIntOverflowMetrics
                                }
                                if iNdEx >= l {
                                        return io.ErrUnexpectedEOF
                                }
                                b := dAtA[iNdEx]
                                iNdEx++
                                stringLen |= uint64(b&0x7F) << shift
                                if b < 0x80 {
                                        break
                                }
                        }
                        intStringLen := int(stringLen)
                        if intStringLen < 0 {
                                return ErrInvalidLengthMetrics
                        }
                        postIndex := iNdEx + intStringLen
                        if postIndex < 0 {
                                return ErrInvalidLengthMetrics
                        }
                        if postIndex > l {
                                return io.ErrUnexpectedEOF
                        }
                        m.Unit = string(dAtA[iNdEx:postIndex])
                        iNdEx = postIndex
                default:
                        // Unknown field: rewind to its tag, measure it, and keep the raw
                        // bytes in XXX_unrecognized.
                        iNdEx = preIndex
                        skippy, err := skipMetrics(dAtA[iNdEx:])
                        if err != nil {
                                return err
                        }
                        if (skippy < 0) || (iNdEx+skippy) < 0 {
                                return ErrInvalidLengthMetrics
                        }
                        if (iNdEx + skippy) > l {
                                return io.ErrUnexpectedEOF
                        }
                        m.XXX_unrecognized = append(m.XXX_unrecognized, dAtA[iNdEx:iNdEx+skippy]...)
                        iNdEx += skippy
                }
        }

        if iNdEx > l {
                return io.ErrUnexpectedEOF
        }
        return nil
}
// skipMetrics measures the first complete protobuf field at the start of dAtA
// so that a caller can step over it without decoding it.
//
// The returned n is the number of bytes the field occupies, tag included. A
// start-group tag (wire type 3) is followed until its matching end-group tag
// (wire type 4), so a group counts as one field. For fixed-width and
// length-delimited payloads n is not compared against len(dAtA); callers have
// to check it against the remaining input themselves.
//
// Errors:
//   - ErrIntOverflowMetrics when a varint is longer than 64 bits,
//   - io.ErrUnexpectedEOF when the input ends before a complete field was read,
//   - ErrInvalidLengthMetrics on a negative length or an overflowing offset,
//   - ErrUnexpectedEndOfGroupMetrics on an end-group tag with no open group,
//   - a formatted error for any other wire type.
func skipMetrics(dAtA []byte) (n int, err error) {
        size := len(dAtA)
        pos, openGroups := 0, 0
        for pos < size {
                // Read the tag varint; only its low three bits are used below.
                var tag uint64
                shift := uint(0)
                for {
                        if shift >= 64 {
                                return 0, ErrIntOverflowMetrics
                        }
                        if pos >= size {
                                return 0, io.ErrUnexpectedEOF
                        }
                        c := dAtA[pos]
                        pos++
                        tag |= (uint64(c) & 0x7F) << shift
                        if c < 0x80 {
                                break
                        }
                        shift += 7
                }

                wireType := int(tag & 0x7)
                switch wireType {
                case 1:
                        // 64-bit fixed-width value.
                        pos += 8
                case 5:
                        // 32-bit fixed-width value.
                        pos += 4
                case 3:
                        // Start of a group.
                        openGroups++
                case 4:
                        // End of a group; there must be one open.
                        if openGroups == 0 {
                                return 0, ErrUnexpectedEndOfGroupMetrics
                        }
                        openGroups--
                case 0:
                        // Varint payload: consume bytes up to and including the first
                        // one without the continuation bit.
                        bits := uint(0)
                        for {
                                if bits >= 64 {
                                        return 0, ErrIntOverflowMetrics
                                }
                                if pos >= size {
                                        return 0, io.ErrUnexpectedEOF
                                }
                                c := dAtA[pos]
                                pos++
                                if c < 0x80 {
                                        break
                                }
                                bits += 7
                        }
                case 2:
                        // Length-delimited payload: a varint length, then that many bytes.
                        var payload int
                        bits := uint(0)
                        for {
                                if bits >= 64 {
                                        return 0, ErrIntOverflowMetrics
                                }
                                if pos >= size {
                                        return 0, io.ErrUnexpectedEOF
                                }
                                c := dAtA[pos]
                                pos++
                                payload |= (int(c) & 0x7F) << bits
                                if c < 0x80 {
                                        break
                                }
                                bits += 7
                        }
                        if payload < 0 {
                                return 0, ErrInvalidLengthMetrics
                        }
                        pos += payload
                default:
                        return 0, fmt.Errorf("proto: illegal wireType %d", wireType)
                }

                // A negative position means one of the additions above overflowed.
                if pos < 0 {
                        return 0, ErrInvalidLengthMetrics
                }
                // Outside of any group, a single field is all that gets skipped.
                if openGroups == 0 {
                        return pos, nil
                }
        }
        return 0, io.ErrUnexpectedEOF
}

// Sentinel errors returned by the protobuf decoding helpers in this file.
var (
        // ErrInvalidLengthMetrics is returned when a decoded length is negative
        // or an index computation overflows.
        ErrInvalidLengthMetrics        = fmt.Errorf("proto: negative length found during unmarshaling")
        // ErrIntOverflowMetrics is returned when a varint runs past 64 bits.
        ErrIntOverflowMetrics          = fmt.Errorf("proto: integer overflow")
        // ErrUnexpectedEndOfGroupMetrics is returned by skipMetrics when an
        // end-group tag appears while no group is open.
        ErrUnexpectedEndOfGroupMetrics = fmt.Errorf("proto: unexpected end of group")
)

// Copyright 2013 The Prometheus Authors
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package promql

import (
        "bytes"
        "container/heap"
        "context"
        "errors"
        "fmt"
        "math"
        "reflect"
        "runtime"
        "slices"
        "sort"
        "strconv"
        "strings"
        "sync"
        "time"

        "github.com/go-kit/log"
        "github.com/go-kit/log/level"
        "github.com/prometheus/client_golang/prometheus"
        "github.com/prometheus/common/model"
        "go.opentelemetry.io/otel"
        "go.opentelemetry.io/otel/attribute"
        "go.opentelemetry.io/otel/trace"

        "github.com/prometheus/prometheus/model/histogram"
        "github.com/prometheus/prometheus/model/labels"
        "github.com/prometheus/prometheus/model/timestamp"
        "github.com/prometheus/prometheus/model/value"
        "github.com/prometheus/prometheus/promql/parser"
        "github.com/prometheus/prometheus/storage"
        "github.com/prometheus/prometheus/tsdb/chunkenc"
        "github.com/prometheus/prometheus/util/annotations"
        "github.com/prometheus/prometheus/util/stats"
        "github.com/prometheus/prometheus/util/zeropool"
)

// Engine-wide constants.
const (
        // namespace and subsystem prefix every metric created in NewEngine.
        namespace            = "prometheus"
        subsystem            = "engine"
        // queryTag is the span attribute key under which Exec records the query.
        queryTag             = "query"
        // env names a processing stage. NOTE(review): presumably passed as the
        // env argument of contextDone/contextErr — confirm against callers.
        env                  = "query execution"
        // defaultLookbackDelta is used by NewEngine when EngineOpts.LookbackDelta is zero.
        defaultLookbackDelta = 5 * time.Minute

        // The largest SampleValue that can be converted to an int64 without overflow.
        // This is 2^63 - 1024.
        maxInt64 = 9223372036854774784
        // The smallest SampleValue that can be converted to an int64 without underflow.
        // This is -2^63.
        minInt64 = -9223372036854775808

        // Max initial size for the pooled points slices.
        // The getHPointSlice and getFPointSlice functions are called with an estimated size which often can be
        // over-estimated.
        maxPointsSliceSize = 5000

        // The default buffer size for points used by the matrix selector.
        matrixSelectorSliceSize = 16
)

// engineMetrics bundles the Prometheus metrics that NewEngine creates for an
// Engine.
type engineMetrics struct {
        // currentQueries is the "queries" gauge: queries being executed or waiting.
        currentQueries       prometheus.Gauge
        // maxConcurrentQueries is the "queries_concurrent_max" gauge; NewEngine
        // sets it to -1 when no active query tracker is configured.
        maxConcurrentQueries prometheus.Gauge
        // queryLogEnabled is the "query_log_enabled" gauge, set to 1 or 0 by
        // SetQueryLogger.
        queryLogEnabled      prometheus.Gauge
        // queryLogFailures is the "query_log_failures_total" counter.
        queryLogFailures     prometheus.Counter
        // The four observers below are children of the "query_duration_seconds"
        // summary, labelled slice="queue_time", "prepare_time", "inner_eval" and
        // "result_sort" respectively.
        queryQueueTime       prometheus.Observer
        queryPrepareTime     prometheus.Observer
        queryInnerEval       prometheus.Observer
        queryResultSort      prometheus.Observer
        // querySamples is the "query_samples_total" counter.
        querySamples         prometheus.Counter
}

// convertibleToInt64 returns true if v does not over-/underflow an int64.
func convertibleToInt64(v float64) bool {
        return v <= maxInt64 && v >= minInt64
}

// ErrQueryTimeout is returned if a query timed out during processing.
// Its string value names the stage in which the timeout happened.
type ErrQueryTimeout string

// Error implements the error interface.
func (e ErrQueryTimeout) Error() string {
        return fmt.Sprint("query timed out in ", string(e))
}

// ErrQueryCanceled is returned if a query was canceled during processing.
// Its string value names the stage in which the cancellation happened.
type ErrQueryCanceled string

// Error implements the error interface.
func (e ErrQueryCanceled) Error() string {
        return fmt.Sprint("query was canceled in ", string(e))
}

// ErrTooManySamples is returned if a query would load more than the maximum
// allowed samples into memory. Its string value names the affected stage.
type ErrTooManySamples string

// Error implements the error interface.
func (e ErrTooManySamples) Error() string {
        return fmt.Sprint("query processing would load too many samples into memory in ", string(e))
}

// ErrStorage is returned if an error was encountered in the storage layer
// during query handling. Err holds the underlying error.
type ErrStorage struct {
        Err error
}

// Error implements the error interface by returning the message of the
// wrapped error unchanged.
func (e ErrStorage) Error() string {
        cause := e.Err
        return cause.Error()
}

// QueryEngine defines the interface for the *promql.Engine, so it can be replaced, wrapped or mocked.
type QueryEngine interface {
        // NewInstantQuery builds a query for the expression qs evaluated at the
        // single point in time ts.
        NewInstantQuery(ctx context.Context, q storage.Queryable, opts QueryOpts, qs string, ts time.Time) (Query, error)
        // NewRangeQuery builds a query for the expression qs evaluated from start
        // to end at steps of interval.
        NewRangeQuery(ctx context.Context, q storage.Queryable, opts QueryOpts, qs string, start, end time.Time, interval time.Duration) (Query, error)
}

// QueryLogger is an interface that can be used to log all the queries logged
// by the engine.
type QueryLogger interface {
        // Log writes one log entry built from the given values.
        Log(...interface{}) error
        // Close releases the logger. The engine calls it on the previous logger
        // when SetQueryLogger installs a new one.
        Close() error
}

// A Query is derived from a raw query string and can be run against the
// engine it is associated with.
type Query interface {
        // Exec processes the query. Can only be called once.
        Exec(ctx context.Context) *Result
        // Close recovers memory used by the query result.
        Close()
        // Statement returns the parsed statement of the query.
        Statement() parser.Statement
        // Stats returns statistics about the lifetime of the query.
        Stats() *stats.Statistics
        // Cancel signals that a running query execution should be aborted.
        Cancel()
        // String returns the original query string.
        String() string
}

// QueryOpts carries the per-query options understood by the engine.
type QueryOpts interface {
        // Enables recording per-step statistics if the engine has it enabled as well. Disabled by default.
        EnablePerStepStats() bool
        // Lookback delta duration for this query.
        LookbackDelta() time.Duration
}

// PrometheusQueryOpts is the plain-struct implementation of QueryOpts.
type PrometheusQueryOpts struct {
        // Enables recording per-step statistics if the engine has it enabled as well. Disabled by default.
        enablePerStepStats bool
        // Lookback delta duration for this query.
        lookbackDelta time.Duration
}

// Compile-time check that *PrometheusQueryOpts satisfies QueryOpts.
var _ QueryOpts = (*PrometheusQueryOpts)(nil)

// NewPrometheusQueryOpts returns a QueryOpts backed by a *PrometheusQueryOpts
// holding the two given settings.
func NewPrometheusQueryOpts(enablePerStepStats bool, lookbackDelta time.Duration) QueryOpts {
        opts := new(PrometheusQueryOpts)
        opts.enablePerStepStats = enablePerStepStats
        opts.lookbackDelta = lookbackDelta
        return opts
}

// EnablePerStepStats reports whether per-step statistics were requested.
func (p *PrometheusQueryOpts) EnablePerStepStats() bool {
        return p.enablePerStepStats
}

// LookbackDelta returns the lookback delta requested for the query.
func (p *PrometheusQueryOpts) LookbackDelta() time.Duration {
        return p.lookbackDelta
}

// query implements the Query interface.
// Instances are built by Engine.newQuery.
type query struct {
        // Underlying data provider.
        queryable storage.Queryable
        // The original query string.
        q string
        // Statement of the parsed query.
        stmt parser.Statement
        // Timer stats for the query execution.
        stats *stats.QueryTimers
        // Sample stats for the query execution.
        sampleStats *stats.QuerySamples
        // Result matrix for reuse. Close hands its point slices to
        // putFPointSlice and putHPointSlice.
        matrix Matrix
        // Cancellation function for the query. May be nil; Cancel checks for that.
        cancel func()

        // The engine against which the query is executed.
        ng *Engine
}

// QueryOrigin is an empty struct type.
// NOTE(review): presumably used as a context key for attaching query-origin
// metadata — confirm against callers.
type QueryOrigin struct{}

// Statement implements the Query interface.
// It returns the statement stored on the query as is, without copying it.
// Calling this after Exec may result in panic,
// see https://github.com/prometheus/prometheus/issues/8949.
func (q *query) Statement() parser.Statement {
        return q.stmt
}

// String implements the Query interface.
// It returns the original query string the query was created from.
func (q *query) String() string {
        return q.q
}

// Stats implements the Query interface.
// Each call returns a fresh Statistics value pointing at the query's timer
// and sample statistics.
func (q *query) Stats() *stats.Statistics {
        st := new(stats.Statistics)
        st.Timers = q.stats
        st.Samples = q.sampleStats
        return st
}

// Cancel implements the Query interface.
// It is a no-op when no cancellation function has been set on the query.
func (q *query) Cancel() {
        if q.cancel == nil {
                return
        }
        q.cancel()
}

// Close implements the Query interface.
// It passes the float and histogram point slices of every series in the
// result matrix to putFPointSlice and putHPointSlice.
func (q *query) Close() {
        for i := range q.matrix {
                putFPointSlice(q.matrix[i].Floats)
                putHPointSlice(q.matrix[i].Histograms)
        }
}

// Exec implements the Query interface.
// It tags the span found in ctx with the query statement, runs the query on
// the engine and wraps value, warnings and error into a Result.
func (q *query) Exec(ctx context.Context) *Result {
        span := trace.SpanFromContext(ctx)
        if span != nil {
                span.SetAttributes(attribute.String(queryTag, q.stmt.String()))
        }

        // Run the query on the owning engine.
        value, warnings, err := q.ng.exec(ctx, q)
        result := &Result{Value: value, Warnings: warnings, Err: err}
        return result
}

// contextDone returns an error if the context was canceled or timed out, and
// nil otherwise. env names the processing stage and ends up in the message of
// the error produced by contextErr.
func contextDone(ctx context.Context, env string) error {
        err := ctx.Err()
        if err == nil {
                return nil
        }
        return contextErr(err, env)
}

// contextErr translates a context error into the engine's own error types:
// context.Canceled becomes ErrQueryCanceled(env) and
// context.DeadlineExceeded becomes ErrQueryTimeout(env). Any other error is
// returned unchanged.
func contextErr(err error, env string) error {
        if errors.Is(err, context.Canceled) {
                return ErrQueryCanceled(env)
        }
        if errors.Is(err, context.DeadlineExceeded) {
                return ErrQueryTimeout(env)
        }
        return err
}

// QueryTracker provides access to two features:
//
// 1) Tracking of active queries. If the PromQL engine crashes while executing a query, that query should be present
// in the tracker on restart, and hence be logged. After the logging on restart, the tracker gets emptied.
//
// 2) Enforcement of the maximum number of concurrent queries.
type QueryTracker interface {
        // GetMaxConcurrent returns the maximum number of concurrent queries that are allowed by this tracker.
        GetMaxConcurrent() int

        // Insert inserts a query into the query tracker. This call must block if the maximum number of queries is already running.
        // If Insert doesn't return an error, the returned integer value should be used in the subsequent Delete call.
        // Insert should return an error if the context is finished before the query can proceed; the integer value returned in this case should be ignored by the caller.
        Insert(ctx context.Context, query string) (int, error)

        // Delete removes a query from the activity tracker. insertIndex is the value returned by the Insert call.
        Delete(insertIndex int)
}

// EngineOpts contains configuration options used when creating a new Engine.
type EngineOpts struct {
        // Logger receives the engine's log output. NewEngine substitutes a no-op
        // logger when it is nil.
        Logger             log.Logger
        // Reg, if non-nil, is the registerer on which NewEngine registers the
        // engine metrics.
        Reg                prometheus.Registerer
        // MaxSamples is stored as the engine's per-query sample limit.
        MaxSamples         int
        // Timeout is stored as the engine's query timeout.
        Timeout            time.Duration
        // ActiveQueryTracker is optional. When nil, NewEngine sets the
        // queries_concurrent_max metric to -1.
        ActiveQueryTracker QueryTracker
        // LookbackDelta determines the time since the last sample after which a time
        // series is considered stale. A zero value is replaced by the default of
        // 5 minutes in NewEngine.
        LookbackDelta time.Duration

        // NoStepSubqueryIntervalFn is the default evaluation interval of
        // a subquery in milliseconds if no step in range vector was specified `[30m:<step>]`.
        NoStepSubqueryIntervalFn func(rangeMillis int64) int64

        // EnableAtModifier if true enables @ modifier. Disabled otherwise. This
        // is supposed to be enabled for regular PromQL (as of Prometheus v2.33)
        // but the option to disable it is still provided here for those using
        // the Engine outside of Prometheus.
        EnableAtModifier bool

        // EnableNegativeOffset if true enables negative (-) offset
        // values. Disabled otherwise. This is supposed to be enabled for
        // regular PromQL (as of Prometheus v2.33) but the option to disable it
        // is still provided here for those using the Engine outside of
        // Prometheus.
        EnableNegativeOffset bool

        // EnablePerStepStats if true allows for per-step stats to be computed on request. Disabled otherwise.
        EnablePerStepStats bool
}

// Engine handles the lifetime of queries from beginning to end.
// It is connected to a querier.
// All fields except queryLogger are set once by NewEngine from EngineOpts.
type Engine struct {
        logger                   log.Logger
        metrics                  *engineMetrics
        timeout                  time.Duration
        maxSamplesPerQuery       int
        activeQueryTracker       QueryTracker
        // queryLogger is replaced by SetQueryLogger while holding queryLoggerLock.
        queryLogger              QueryLogger
        queryLoggerLock          sync.RWMutex
        // lookbackDelta is the engine default, used by newQuery when the query
        // options carry no positive lookback delta.
        lookbackDelta            time.Duration
        noStepSubqueryIntervalFn func(rangeMillis int64) int64
        // enableAtModifier and enableNegativeOffset are consulted by validateOpts.
        enableAtModifier         bool
        enableNegativeOffset     bool
        // enablePerStepStats must be true, in addition to the per-query option,
        // for newQuery to enable per-step sample statistics.
        enablePerStepStats       bool
}

// NewEngine returns a new engine configured from opts.
//
// A nil opts.Logger is replaced by a no-op logger and a zero
// opts.LookbackDelta by defaultLookbackDelta. The engine metrics are always
// created; they are registered only when opts.Reg is non-nil, and
// MustRegister panics if that registration fails.
func NewEngine(opts EngineOpts) *Engine {
        logger := opts.Logger
        if logger == nil {
                logger = log.NewNopLogger()
        }

        // Small constructors so that every metric shares namespace and subsystem.
        newGauge := func(name, help string) prometheus.Gauge {
                return prometheus.NewGauge(prometheus.GaugeOpts{
                        Namespace: namespace,
                        Subsystem: subsystem,
                        Name:      name,
                        Help:      help,
                })
        }
        newCounter := func(name, help string) prometheus.Counter {
                return prometheus.NewCounter(prometheus.CounterOpts{
                        Namespace: namespace,
                        Subsystem: subsystem,
                        Name:      name,
                        Help:      help,
                })
        }

        // One summary, split by the "slice" label into the four query phases.
        durations := prometheus.NewSummaryVec(
                prometheus.SummaryOpts{
                        Namespace:  namespace,
                        Subsystem:  subsystem,
                        Name:       "query_duration_seconds",
                        Help:       "Query timings",
                        Objectives: map[float64]float64{0.5: 0.05, 0.9: 0.01, 0.99: 0.001},
                },
                []string{"slice"},
        )

        m := &engineMetrics{}
        m.currentQueries = newGauge("queries", "The current number of queries being executed or waiting.")
        m.queryLogEnabled = newGauge("query_log_enabled", "State of the query log.")
        m.queryLogFailures = newCounter("query_log_failures_total", "The number of query log failures.")
        m.maxConcurrentQueries = newGauge("queries_concurrent_max", "The max number of concurrent queries.")
        m.querySamples = newCounter("query_samples_total", "The total number of samples loaded by all queries.")
        m.queryQueueTime = durations.WithLabelValues("queue_time")
        m.queryPrepareTime = durations.WithLabelValues("prepare_time")
        m.queryInnerEval = durations.WithLabelValues("inner_eval")
        m.queryResultSort = durations.WithLabelValues("result_sort")

        // -1 signals that no tracker limits the number of concurrent queries.
        maxConcurrent := float64(-1)
        if tracker := opts.ActiveQueryTracker; tracker != nil {
                maxConcurrent = float64(tracker.GetMaxConcurrent())
        }
        m.maxConcurrentQueries.Set(maxConcurrent)

        lookback := opts.LookbackDelta
        if lookback == 0 {
                lookback = defaultLookbackDelta
                // logger is never nil here because of the defaulting above.
                level.Debug(logger).Log("msg", "Lookback delta is zero, setting to default value", "value", defaultLookbackDelta)
        }

        if reg := opts.Reg; reg != nil {
                reg.MustRegister(
                        m.currentQueries,
                        m.maxConcurrentQueries,
                        m.queryLogEnabled,
                        m.queryLogFailures,
                        m.querySamples,
                        durations,
                )
        }

        ng := &Engine{
                logger:                   logger,
                metrics:                  m,
                timeout:                  opts.Timeout,
                maxSamplesPerQuery:       opts.MaxSamples,
                activeQueryTracker:       opts.ActiveQueryTracker,
                lookbackDelta:            lookback,
                noStepSubqueryIntervalFn: opts.NoStepSubqueryIntervalFn,
                enableAtModifier:         opts.EnableAtModifier,
                enableNegativeOffset:     opts.EnableNegativeOffset,
                enablePerStepStats:       opts.EnablePerStepStats,
        }
        return ng
}

// SetQueryLogger sets the query logger.
// Any previously installed logger is closed first, and the query_log_enabled
// gauge is set to 1 when l is non-nil and to 0 otherwise. The whole swap runs
// under queryLoggerLock.
func (ng *Engine) SetQueryLogger(l QueryLogger) {
        ng.queryLoggerLock.Lock()
        defer ng.queryLoggerLock.Unlock()

        if prev := ng.queryLogger; prev != nil {
                // An error closing the old file descriptor should
                // not make reload fail; only log a warning.
                if err := prev.Close(); err != nil {
                        level.Warn(ng.logger).Log("msg", "Error while closing the previous query log file", "err", err)
                }
        }
        ng.queryLogger = l

        enabled := float64(0)
        if l != nil {
                enabled = 1
        }
        ng.metrics.queryLogEnabled.Set(enabled)
}

// NewInstantQuery returns an evaluation query for the given expression at the given time.
//
// The query is passed to queueActive before the expression is parsed, and the
// function returned by queueActive runs when NewInstantQuery returns. Parse
// and option-validation errors are returned as is.
func (ng *Engine) NewInstantQuery(ctx context.Context, q storage.Queryable, opts QueryOpts, qs string, ts time.Time) (Query, error) {
        exprSlot, qry := ng.newQuery(q, qs, opts, ts, ts, 0)

        done, err := ng.queueActive(ctx, qry)
        if err != nil {
                return nil, err
        }
        defer done()

        parsed, err := parser.ParseExpr(qs)
        if err != nil {
                return nil, err
        }
        if err = ng.validateOpts(parsed); err != nil {
                return nil, err
        }

        // Fill in the statement's expression now that it is known to be valid.
        *exprSlot = PreprocessExpr(parsed, ts, ts)
        return qry, nil
}

// NewRangeQuery returns an evaluation query for the given time range and with
// the resolution set by the interval.
//
// Only expressions of type instant vector or scalar are accepted; any other
// type yields an error. Parse and option-validation errors are returned as is.
func (ng *Engine) NewRangeQuery(ctx context.Context, q storage.Queryable, opts QueryOpts, qs string, start, end time.Time, interval time.Duration) (Query, error) {
        exprSlot, qry := ng.newQuery(q, qs, opts, start, end, interval)

        done, err := ng.queueActive(ctx, qry)
        if err != nil {
                return nil, err
        }
        defer done()

        parsed, err := parser.ParseExpr(qs)
        if err != nil {
                return nil, err
        }
        if err = ng.validateOpts(parsed); err != nil {
                return nil, err
        }

        switch parsed.Type() {
        case parser.ValueTypeVector, parser.ValueTypeScalar:
                // These are the result types a range query may have.
        default:
                return nil, fmt.Errorf("invalid expression type %q for range query, must be Scalar or instant Vector", parser.DocumentedType(parsed.Type()))
        }

        // Fill in the statement's expression now that it is known to be valid.
        *exprSlot = PreprocessExpr(parsed, start, end)
        return qry, nil
}

// newQuery builds the query object and its evaluation statement for qs.
//
// It returns a pointer to the statement's still-empty Expr field, which the
// caller fills in after parsing, together with the query itself. Nil opts are
// treated as "no per-step stats, no lookback override". A non-positive
// lookback delta in opts falls back to the engine's own. Per-step sample
// statistics are enabled only when both the engine and opts ask for them.
func (ng *Engine) newQuery(q storage.Queryable, qs string, opts QueryOpts, start, end time.Time, interval time.Duration) (*parser.Expr, *query) {
        if opts == nil {
                opts = NewPrometheusQueryOpts(false, 0)
        }

        delta := opts.LookbackDelta()
        if delta <= 0 {
                delta = ng.lookbackDelta
        }

        stmt := &parser.EvalStmt{
                Start:         start,
                End:           end,
                Interval:      interval,
                LookbackDelta: delta,
        }

        perStep := ng.enablePerStepStats && opts.EnablePerStepStats()
        qry := &query{
                queryable:   q,
                q:           qs,
                stmt:        stmt,
                stats:       stats.NewQueryTimers(),
                sampleStats: stats.NewQuerySamples(perStep),
                ng:          ng,
        }
        return &stmt.Expr, qry
}

// Errors reported by Engine.validateOpts when an expression uses a feature
// that is disabled on the engine.
var (
        // ErrValidationAtModifierDisabled is returned when the expression uses
        // the @ modifier while the engine has it disabled.
        ErrValidationAtModifierDisabled     = errors.New("@ modifier is disabled")
        // ErrValidationNegativeOffsetDisabled is returned when the expression
        // uses a negative offset while the engine has it disabled.
        ErrValidationNegativeOffsetDisabled = errors.New("negative offset is disabled")
)

// validateOpts walks expr and reports whether it uses a feature that is
// disabled on this engine. It returns ErrValidationAtModifierDisabled or
// ErrValidationNegativeOffsetDisabled for the first offending node found, and
// nil if the expression is acceptable. When both features are enabled the
// expression is not inspected at all.
func (ng *Engine) validateOpts(expr parser.Expr) error {
        if ng.enableAtModifier && ng.enableNegativeOffset {
                return nil
        }

        var (
                sawAtModifier, sawNegativeOffset bool
                result                           error
        )

        parser.Inspect(expr, func(node parser.Node, path []parser.Node) error {
                // The flags are sticky: once a feature has been seen it stays seen.
                switch n := node.(type) {
                case *parser.VectorSelector:
                        sawAtModifier = sawAtModifier || n.Timestamp != nil || n.StartOrEnd == parser.START || n.StartOrEnd == parser.END
                        sawNegativeOffset = sawNegativeOffset || n.OriginalOffset < 0

                case *parser.MatrixSelector:
                        inner := n.VectorSelector.(*parser.VectorSelector)
                        sawAtModifier = sawAtModifier || inner.Timestamp != nil || inner.StartOrEnd == parser.START || inner.StartOrEnd == parser.END
                        sawNegativeOffset = sawNegativeOffset || inner.OriginalOffset < 0

                case *parser.SubqueryExpr:
                        sawAtModifier = sawAtModifier || n.Timestamp != nil || n.StartOrEnd == parser.START || n.StartOrEnd == parser.END
                        sawNegativeOffset = sawNegativeOffset || n.OriginalOffset < 0
                }

                // The @ modifier check takes precedence over the offset check.
                switch {
                case sawAtModifier && !ng.enableAtModifier:
                        result = ErrValidationAtModifierDisabled
                        return result
                case sawNegativeOffset && !ng.enableNegativeOffset:
                        result = ErrValidationNegativeOffsetDisabled
                        return result
                }
                return nil
        })

        return result
}

// NewTestQuery returns a query whose statement is f wrapped in a
// parser.TestStmt, which lets tests inject special behaviour into a Query.
func (ng *Engine) NewTestQuery(f func(context.Context) error) Query {
        return &query{
                q:           "test statement",
                stmt:        parser.TestStmt(f),
                ng:          ng,
                stats:       stats.NewQueryTimers(),
                sampleStats: stats.NewQuerySamples(ng.enablePerStepStats),
        }
}

// exec executes the query.
//
// At this point per query only one EvalStmt is evaluated. Alert and record
// statements are not handled by the Engine.
//
// The query runs under a context bounded by the engine timeout. While it
// runs it is counted in the current-queries metric and inserted into the
// active query tracker. When exec returns, the query (with its parameters,
// stats and any error) is written to the query logger if one is configured.
//
// exec panics if the query holds a statement type it does not know about.
func (ng *Engine) exec(ctx context.Context, q *query) (v parser.Value, ws annotations.Annotations, err error) {
        ng.metrics.currentQueries.Inc()
        defer func() {
                ng.metrics.currentQueries.Dec()
                ng.metrics.querySamples.Add(float64(q.sampleStats.TotalSamples))
        }()

        ctx, cancel := context.WithTimeout(ctx, ng.timeout)
        q.cancel = cancel

        // Log the query on the way out. The closure reads the named result err
        // and the latest value of ctx, so it sees the outcome of the execution.
        defer func() {
                ng.queryLoggerLock.RLock()
                // Release the lock via defer: if anything below panics (the
                // unchecked type assertion on the query origin, or the logger
                // itself), a plain trailing RUnlock would be skipped and the
                // read lock would stay held, blocking every later writer.
                defer ng.queryLoggerLock.RUnlock()

                l := ng.queryLogger
                if l == nil {
                        return
                }

                params := make(map[string]interface{}, 4)
                params["query"] = q.q
                if eq, ok := q.Statement().(*parser.EvalStmt); ok {
                        params["start"] = formatDate(eq.Start)
                        params["end"] = formatDate(eq.End)
                        // The step provided by the user is in seconds.
                        params["step"] = int64(eq.Interval / (time.Second / time.Nanosecond))
                }
                f := []interface{}{"params", params}
                if err != nil {
                        f = append(f, "error", err)
                }
                f = append(f, "stats", stats.NewQueryStats(q.Stats()))
                if span := trace.SpanFromContext(ctx); span != nil {
                        f = append(f, "spanID", span.SpanContext().SpanID())
                }
                if origin := ctx.Value(QueryOrigin{}); origin != nil {
                        for k, v := range origin.(map[string]interface{}) {
                                f = append(f, k, v)
                        }
                }
                if err := l.Log(f...); err != nil {
                        ng.metrics.queryLogFailures.Inc()
                        level.Error(ng.logger).Log("msg", "can't log query", "err", err)
                }
        }()

        execSpanTimer, ctx := q.stats.GetSpanTimer(ctx, stats.ExecTotalTime)
        defer execSpanTimer.Finish()

        finishQueue, err := ng.queueActive(ctx, q)
        if err != nil {
                return nil, nil, err
        }
        defer finishQueue()

        // Cancel when execution is done or an error was raised.
        defer q.cancel()

        evalSpanTimer, ctx := q.stats.GetSpanTimer(ctx, stats.EvalTotalTime)
        defer evalSpanTimer.Finish()

        // The base context might already be canceled on the first iteration (e.g. during shutdown).
        if err := contextDone(ctx, env); err != nil {
                return nil, nil, err
        }

        switch s := q.Statement().(type) {
        case *parser.EvalStmt:
                return ng.execEvalStmt(ctx, q, s)
        case parser.TestStmt:
                return nil, nil, s(ctx)
        }

        panic(fmt.Errorf("promql.Engine.exec: unhandled statement of type %T", q.Statement()))
}

// queueActive logs the query in the active log. The active log guarantees
// that we don't run over MaxConcurrent queries.
//
// The time spent inserting is recorded under the ExecQueueTime span timer.
// The returned function removes the query from the active log again. If the
// engine has no active query tracker, the returned function does nothing and
// the error is nil.
func (ng *Engine) queueActive(ctx context.Context, q *query) (func(), error) {
        if ng.activeQueryTracker == nil {
                noop := func() {}
                return noop, nil
        }

        queueTimer, _ := q.stats.GetSpanTimer(ctx, stats.ExecQueueTime, ng.metrics.queryQueueTime)
        idx, insertErr := ng.activeQueryTracker.Insert(ctx, q.q)
        queueTimer.Finish()

        remove := func() {
                ng.activeQueryTracker.Delete(idx)
        }
        return remove, insertErr
}

// timeMilliseconds converts t to milliseconds since the Unix epoch. The
// sub-millisecond part is dropped by integer division of the nanosecond
// timestamp, i.e. the result is truncated towards zero.
func timeMilliseconds(t time.Time) int64 {
        const nanosPerMilli = int64(time.Millisecond / time.Nanosecond)
        return t.UnixNano() / nanosPerMilli
}

// durationMilliseconds converts d to whole milliseconds. The sub-millisecond
// part is dropped by integer division, i.e. the result is truncated towards
// zero.
func durationMilliseconds(d time.Duration) int64 {
        const oneMilli = time.Millisecond / time.Nanosecond
        return int64(d / oneMilli)
}

// execEvalStmt evaluates the expression of an evaluation statement for the given time range.
//
// It opens a querier covering the time span computed by FindMinMaxTime, has
// populateSeries issue the selects for the expression's selectors, and then
// runs an evaluator over the expression. A statement with Start == End and a
// zero Interval is treated as an instant query and its result is converted to
// the expression's value type; anything else is treated as a range query and
// yields a sorted Matrix.
//
// The function panics if the evaluator returns a value of an unexpected type.
func (ng *Engine) execEvalStmt(ctx context.Context, query *query, s *parser.EvalStmt) (parser.Value, annotations.Annotations, error) {
        prepareSpanTimer, ctxPrepare := query.stats.GetSpanTimer(ctx, stats.QueryPreparationTime, ng.metrics.queryPrepareTime)
        mint, maxt := FindMinMaxTime(s)
        querier, err := query.queryable.Querier(mint, maxt)
        if err != nil {
                // Stop the preparation timer on the error path as well.
                prepareSpanTimer.Finish()
                return nil, nil, err
        }
        defer querier.Close()

        ng.populateSeries(ctxPrepare, querier, s)
        prepareSpanTimer.Finish()

        // Modify the offset of vector and matrix selectors for the @ modifier
        // w.r.t. the start time since only 1 evaluation will be done on them.
        setOffsetForAtModifier(timeMilliseconds(s.Start), s.Expr)
        evalSpanTimer, ctxInnerEval := query.stats.GetSpanTimer(ctx, stats.InnerEvalTime, ng.metrics.queryInnerEval)
        // Instant evaluation. This is executed as a range evaluation with one step.
        if s.Start == s.End && s.Interval == 0 {
                start := timeMilliseconds(s.Start)
                evaluator := &evaluator{
                        startTimestamp:           start,
                        endTimestamp:             start,
                        interval:                 1,
                        ctx:                      ctxInnerEval,
                        maxSamples:               ng.maxSamplesPerQuery,
                        logger:                   ng.logger,
                        lookbackDelta:            s.LookbackDelta,
                        samplesStats:             query.sampleStats,
                        noStepSubqueryIntervalFn: ng.noStepSubqueryIntervalFn,
                }
                query.sampleStats.InitStepTracking(start, start, 1)

                val, warnings, err := evaluator.Eval(s.Expr)

                evalSpanTimer.Finish()

                if err != nil {
                        return nil, warnings, err
                }

                var mat Matrix

                // The evaluator is expected to produce either a Matrix or a String.
                switch result := val.(type) {
                case Matrix:
                        mat = result
                case String:
                        return result, warnings, nil
                default:
                        panic(fmt.Errorf("promql.Engine.exec: invalid expression type %q", val.Type()))
                }

                query.matrix = mat
                // Convert the matrix into the value type of the expression.
                switch s.Expr.Type() {
                case parser.ValueTypeVector:
                        // Convert matrix with one value per series into vector.
                        vector := make(Vector, len(mat))
                        for i, s := range mat {
                                // Point might have a different timestamp, force it to the evaluation
                                // timestamp as that is when we ran the evaluation.
                                if len(s.Histograms) > 0 {
                                        vector[i] = Sample{Metric: s.Metric, H: s.Histograms[0].H, T: start}
                                } else {
                                        vector[i] = Sample{Metric: s.Metric, F: s.Floats[0].F, T: start}
                                }
                        }
                        return vector, warnings, nil
                case parser.ValueTypeScalar:
                        // NOTE(review): assumes the matrix holds at least one series with
                        // at least one float point — confirm the evaluator guarantees this.
                        return Scalar{V: mat[0].Floats[0].F, T: start}, warnings, nil
                case parser.ValueTypeMatrix:
                        ng.sortMatrixResult(ctx, query, mat)
                        return mat, warnings, nil
                default:
                        panic(fmt.Errorf("promql.Engine.exec: unexpected expression type %q", s.Expr.Type()))
                }
        }

        // Range evaluation.
        evaluator := &evaluator{
                startTimestamp:           timeMilliseconds(s.Start),
                endTimestamp:             timeMilliseconds(s.End),
                interval:                 durationMilliseconds(s.Interval),
                ctx:                      ctxInnerEval,
                maxSamples:               ng.maxSamplesPerQuery,
                logger:                   ng.logger,
                lookbackDelta:            s.LookbackDelta,
                samplesStats:             query.sampleStats,
                noStepSubqueryIntervalFn: ng.noStepSubqueryIntervalFn,
        }
        query.sampleStats.InitStepTracking(evaluator.startTimestamp, evaluator.endTimestamp, evaluator.interval)
        val, warnings, err := evaluator.Eval(s.Expr)

        evalSpanTimer.Finish()

        if err != nil {
                return nil, warnings, err
        }

        // A range evaluation must always produce a Matrix.
        mat, ok := val.(Matrix)
        if !ok {
                panic(fmt.Errorf("promql.Engine.exec: invalid expression type %q", val.Type()))
        }
        query.matrix = mat

        // Check the context once more before spending time on sorting.
        if err := contextDone(ctx, "expression evaluation"); err != nil {
                return nil, warnings, err
        }

        // TODO(fabxc): where to ensure metric labels are a copy from the storage internals.
        ng.sortMatrixResult(ctx, query, mat)

        return mat, warnings, nil
}

// sortMatrixResult sorts mat in place with sort.Sort and records the time
// spent under the ResultSortTime span timer of the query.
func (ng *Engine) sortMatrixResult(ctx context.Context, query *query, mat Matrix) {
        timer, _ := query.stats.GetSpanTimer(ctx, stats.ResultSortTime, ng.metrics.queryResultSort)

        sort.Sort(mat)

        timer.Finish()
}

// subqueryTimes returns the sum of offsets and the sum of ranges of all
// subqueries in the path. A subquery carrying an @ modifier resets both sums
// to its own offset and range, because offsets and ranges are then relative
// to that timestamp.
// The returned *int64 is the timestamp of the last such subquery in the path,
// or nil if no subquery in the path uses the @ modifier.
func subqueryTimes(path []parser.Node) (time.Duration, time.Duration, *int64) {
        var totalOffset, totalRange time.Duration
        // math.MaxInt64 doubles as the "no @ modifier seen" marker.
        var atTs int64 = math.MaxInt64

        for _, node := range path {
                sq, isSubquery := node.(*parser.SubqueryExpr)
                if !isSubquery {
                        continue
                }
                if sq.Timestamp == nil {
                        totalOffset += sq.OriginalOffset
                        totalRange += sq.Range
                        continue
                }
                // The @ modifier on subquery invalidates all the offset and
                // range till now. Hence start over from this subquery.
                totalOffset = sq.OriginalOffset
                totalRange = sq.Range
                atTs = *sq.Timestamp
        }

        if atTs == math.MaxInt64 {
                return totalOffset, totalRange, nil
        }
        return totalOffset, totalRange, &atTs
}

// FindMinMaxTime returns, in milliseconds, the earliest and the latest point
// in time that the statement will try to process, taking offsets, @ modifiers
// and range selectors into account.
// If the statement does not select any series, it returns (0, 0).
func FindMinMaxTime(s *parser.EvalStmt) (int64, int64) {
        var (
                earliest int64 = math.MaxInt64
                latest   int64 = math.MinInt64
                // Range of the enclosing MatrixSelector, if any. It is set when
                // a MatrixSelector is visited and consumed (reset to zero) by
                // the VectorSelector visited after it.
                pendingRange time.Duration
        )

        parser.Inspect(s.Expr, func(node parser.Node, path []parser.Node) error {
                if ms, ok := node.(*parser.MatrixSelector); ok {
                        pendingRange = ms.Range
                        return nil
                }
                vs, ok := node.(*parser.VectorSelector)
                if !ok {
                        return nil
                }

                from, to := getTimeRangesForSelector(s, vs, path, pendingRange)
                if from < earliest {
                        earliest = from
                }
                if to > latest {
                        latest = to
                }
                pendingRange = 0
                return nil
        })

        if latest == math.MinInt64 {
                // No selector was visited, hence there is no time range to select.
                return 0, 0
        }
        return earliest, latest
}

// getTimeRangesForSelector computes the start and end timestamps, in
// milliseconds, for the vector selector n located at path within the
// statement s. evalRange is the range of the enclosing matrix selector, or
// zero if n is not wrapped in one.
//
// The base time range is the statement's, unless a subquery in the path or
// the selector itself carries an @ modifier. Subquery offsets and ranges, the
// lookback delta (or evalRange, if non-zero) and the selector's own offset
// then move the boundaries back in time.
func getTimeRangesForSelector(s *parser.EvalStmt, n *parser.VectorSelector, path []parser.Node, evalRange time.Duration) (int64, int64) {
        start, end := timestamp.FromTime(s.Start), timestamp.FromTime(s.End)
        subqOffset, subqRange, subqTs := subqueryTimes(path)

        if n.Timestamp != nil {
                // The timestamp on the selector overrides everything.
                start, end = *n.Timestamp, *n.Timestamp
        } else {
                if subqTs != nil {
                        // The timestamp on the subquery overrides the eval statement time ranges.
                        start, end = *subqTs, *subqTs
                }
                subqOffsetMs := durationMilliseconds(subqOffset)
                start -= subqOffsetMs + durationMilliseconds(subqRange)
                end -= subqOffsetMs
        }

        // For a plain vector selector look back by the lookback delta. For a
        // matrix selector make sure `range` worth of data before the start
        // time is selected.
        window := evalRange
        if window == 0 {
                window = s.LookbackDelta
        }
        start -= durationMilliseconds(window)

        // Finally apply the selector's own offset to both boundaries.
        selectorOffsetMs := durationMilliseconds(n.OriginalOffset)
        return start - selectorOffsetMs, end - selectorOffsetMs
}

// getLastSubqueryInterval returns the step of the last subquery in path. A
// subquery without an explicit step gets the step that the engine's
// noStepSubqueryIntervalFn derives from the subquery's range. The result is
// zero if path contains no subquery.
func (ng *Engine) getLastSubqueryInterval(path []parser.Node) time.Duration {
        var step time.Duration
        for _, node := range path {
                sq, ok := node.(*parser.SubqueryExpr)
                if !ok {
                        continue
                }
                if sq.Step != 0 {
                        step = sq.Step
                        continue
                }
                rangeMs := durationMilliseconds(sq.Range)
                step = time.Duration(ng.noStepSubqueryIntervalFn(rangeMs)) * time.Millisecond
        }
        return step
}

// populateSeries issues a Select on the querier for every vector selector in
// the statement's expression and stores the returned series set on the
// selector as its UnexpandedSeriesSet. The select hints carry the time range,
// step, range, surrounding function and grouping derived from the selector's
// position in the expression.
func (ng *Engine) populateSeries(ctx context.Context, querier storage.Querier, s *parser.EvalStmt) {
        // Range of the enclosing MatrixSelector, if any. It is set when a
        // MatrixSelector is visited and consumed (reset to zero) by the
        // VectorSelector visited after it.
        var pendingRange time.Duration

        parser.Inspect(s.Expr, func(node parser.Node, path []parser.Node) error {
                if ms, ok := node.(*parser.MatrixSelector); ok {
                        pendingRange = ms.Range
                        return nil
                }
                vs, ok := node.(*parser.VectorSelector)
                if !ok {
                        return nil
                }

                mint, maxt := getTimeRangesForSelector(s, vs, path, pendingRange)

                // Inside a subquery the subquery's step applies, otherwise the
                // statement's interval.
                step := ng.getLastSubqueryInterval(path)
                if step == 0 {
                        step = s.Interval
                }

                by, grouping := extractGroupsFromPath(path)
                hints := &storage.SelectHints{
                        Start:    mint,
                        End:      maxt,
                        Step:     durationMilliseconds(step),
                        Range:    durationMilliseconds(pendingRange),
                        Func:     extractFuncFromPath(path),
                        By:       by,
                        Grouping: grouping,
                }
                pendingRange = 0

                vs.UnexpandedSeriesSet = querier.Select(ctx, false, hints, vs.LabelMatchers...)
                return nil
        })
}

// extractFuncFromPath walks the path from its last node towards its first
// and returns the name of the first function call or aggregation operator it
// meets. It returns the empty string if it reaches a binary expression first
// or if the path contains neither.
func extractFuncFromPath(p []parser.Node) string {
        for i := len(p) - 1; i >= 0; i-- {
                switch n := p[i].(type) {
                case *parser.AggregateExpr:
                        return n.Op.String()
                case *parser.Call:
                        return n.Func.Name
                case *parser.BinaryExpr:
                        // Stop at a binary expression: only functions or
                        // aggregations over a single metric are of interest.
                        return ""
                }
        }
        return ""
}

// extractGroupsFromPath inspects the last node of the path. If it is an
// aggregation, it returns whether the aggregation groups "by" (true) or
// "without" (false) together with its grouping labels. In every other case
// it returns (false, nil).
func extractGroupsFromPath(p []parser.Node) (bool, []string) {
        if len(p) > 0 {
                if agg, ok := p[len(p)-1].(*parser.AggregateExpr); ok {
                        return !agg.Without, agg.Grouping
                }
        }
        return false, nil
}

// checkAndExpandSeriesSet expands the unexpanded series set of a vector
// selector (or of the vector selector inside a matrix selector) into its
// Series field, unless that has already been done. If the selector is marked
// with SkipHistogramBuckets, every series is wrapped with
// newHistogramStatsSeries. Any other kind of expression is left untouched.
// It returns the warnings and the error reported by the expansion.
func checkAndExpandSeriesSet(ctx context.Context, expr parser.Expr) (annotations.Annotations, error) {
        // Unwrap matrix selectors down to the underlying selector.
        for {
                ms, ok := expr.(*parser.MatrixSelector)
                if !ok {
                        break
                }
                expr = ms.VectorSelector
        }

        vs, ok := expr.(*parser.VectorSelector)
        if !ok || vs.Series != nil {
                return nil, nil
        }

        expanded, ws, err := expandSeriesSet(ctx, vs.UnexpandedSeriesSet)
        if vs.SkipHistogramBuckets {
                for i, series := range expanded {
                        expanded[i] = newHistogramStatsSeries(series)
                }
        }
        vs.Series = expanded
        return ws, err
}

// expandSeriesSet drains the series set into a slice. Before each series is
// collected the context is checked; if it is done, the series collected so
// far are discarded and the context's error is returned. Otherwise the
// collected series are returned along with the warnings and the error of the
// series set.
func expandSeriesSet(ctx context.Context, it storage.SeriesSet) (res []storage.Series, ws annotations.Annotations, err error) {
        for it.Next() {
                select {
                case <-ctx.Done():
                        return nil, nil, ctx.Err()
                default:
                        res = append(res, it.At())
                }
        }
        return res, it.Warnings(), it.Err()
}

// errWithWarnings bundles an error with a set of annotations. When a value of
// this type is recovered by evaluator.recover, the error and the warnings are
// handed back to the caller separately.
type errWithWarnings struct {
        err      error
        warnings annotations.Annotations
}

// Error implements the error interface by returning the message of the wrapped error.
func (e errWithWarnings) Error() string { return e.err.Error() }

// An evaluator evaluates the given expressions over the given fixed
// timestamps. It is attached to an engine through which it connects to a
// querier and reports errors. On timeout or cancellation of its context it
// terminates.
type evaluator struct {
        // ctx is checked for cancellation at every evaluation step.
        ctx context.Context

        startTimestamp int64 // Start time in milliseconds.
        endTimestamp   int64 // End time in milliseconds.
        interval       int64 // Interval in milliseconds.

        // maxSamples is the limit for currentSamples; exceeding it aborts the
        // evaluation with ErrTooManySamples.
        maxSamples               int
        // currentSamples is the running count of samples held by the evaluation.
        currentSamples           int
        // logger receives the report about runtime panics caught by recover.
        logger                   log.Logger
        lookbackDelta            time.Duration
        // samplesStats is updated with the peak of currentSamples.
        samplesStats             *stats.QuerySamples
        // noStepSubqueryIntervalFn maps a range in milliseconds to an int64.
        // NOTE(review): presumably the default subquery step in milliseconds
        // for subqueries without an explicit step — confirm against callers.
        noStepSubqueryIntervalFn func(rangeMillis int64) int64
}

// errorf formats its arguments into an error and panics with it. The panic
// is meant to be turned back into an ordinary error by the evaluator's
// recover method.
func (ev *evaluator) errorf(format string, args ...interface{}) {
        panic(fmt.Errorf(format, args...))
}

// error causes a panic with the given error. The evaluator's recover method
// converts such a panic back into an error returned from Eval.
func (ev *evaluator) error(err error) {
        panic(err)
}

// recover is the handler that turns panics into returns from the top level
// of evaluation. It must be invoked directly by a defer statement.
//
// A runtime error is logged together with a stack trace and reported as an
// "unexpected error". An errWithWarnings is split into its error, stored in
// *errp, and its warnings, merged into *ws. Any other error is stored in
// *errp as is, and any other value is formatted into a new error.
func (ev *evaluator) recover(expr parser.Expr, ws *annotations.Annotations, errp *error) {
        recovered := recover()
        if recovered == nil {
                return
        }

        if rtErr, ok := recovered.(runtime.Error); ok {
                // Print the stack trace but do not inhibit the running application.
                stack := make([]byte, 64<<10)
                n := runtime.Stack(stack, false)

                level.Error(ev.logger).Log("msg", "runtime panic in parser", "expr", expr.String(), "err", recovered, "stacktrace", string(stack[:n]))
                *errp = fmt.Errorf("unexpected error: %w", rtErr)
                return
        }

        if withWarnings, ok := recovered.(errWithWarnings); ok {
                *errp = withWarnings.err
                ws.Merge(withWarnings.warnings)
                return
        }

        if plainErr, ok := recovered.(error); ok {
                *errp = plainErr
                return
        }

        *errp = fmt.Errorf("%v", recovered)
}

// Eval evaluates expr and returns the resulting value together with any
// warnings. Panics raised during evaluation are caught by the deferred
// recover handler, which reports them through the named results ws and err.
func (ev *evaluator) Eval(expr parser.Expr) (v parser.Value, ws annotations.Annotations, err error) {
        // The results are named so that the deferred handler can overwrite them.
        defer ev.recover(expr, &ws, &err)

        v, ws = ev.eval(expr)
        return v, ws, nil
}

// EvalSeriesHelper stores extra information about a series. In rangeEval one
// helper exists per input series and is filled in by the prepSeries callback.
type EvalSeriesHelper struct {
        // Used to map left-hand to right-hand in binary operations.
        signature string
}

// EvalNodeHelper stores extra information and caches for evaluating a single node across steps.
// In rangeEval a single helper is created per call and handed to funcCall at
// every step, with Ts updated to the step's timestamp.
type EvalNodeHelper struct {
        // Evaluation timestamp.
        Ts int64
        // Vector that can be used for output.
        Out Vector

        // Caches.
        // funcHistogramQuantile for classic histograms.
        signatureToMetricWithBuckets map[string]*metricWithBuckets

        // lb is a label builder created on first use by resetBuilder and reset
        // on subsequent uses.
        lb           *labels.Builder
        lblBuf       []byte
        lblResultBuf []byte

        // For binary vector matching.
        rightSigs    map[string]Sample
        matchedSigs  map[string]map[uint64]struct{}
        resultMetric map[string]labels.Labels
}

// resetBuilder points the helper's label builder at lbls. The builder is
// created on first use and reset on every later call.
func (enh *EvalNodeHelper) resetBuilder(lbls labels.Labels) {
        if enh.lb != nil {
                enh.lb.Reset(lbls)
                return
        }
        enh.lb = labels.NewBuilder(lbls)
}

// rangeEval evaluates the given expressions, and then for each step calls
// the given funcCall with the values computed for each expression at that
// step. The return value is the combination into time series of all the
// function call results.
// The prepSeries function (if provided) can be used to prepare the helper
// for each series, then passed to each call funcCall.
//
// Expressions that are nil or of string type are not evaluated; funcCall
// receives an empty vector in their position. The evaluation is aborted (via
// panic, see evaluator.error) when the context is done, when the number of
// samples exceeds ev.maxSamples, or when a step produces the same labelset
// more than once.
func (ev *evaluator) rangeEval(prepSeries func(labels.Labels, *EvalSeriesHelper), funcCall func([]parser.Value, [][]EvalSeriesHelper, *EvalNodeHelper) (Vector, annotations.Annotations), exprs ...parser.Expr) (Matrix, annotations.Annotations) {
        numSteps := int((ev.endTimestamp-ev.startTimestamp)/ev.interval) + 1
        matrixes := make([]Matrix, len(exprs))
        origMatrixes := make([]Matrix, len(exprs))
        // Sample count on entry; the final count is rebuilt from this value
        // plus the size of the output matrix.
        originalNumSamples := ev.currentSamples

        var warnings annotations.Annotations
        for i, e := range exprs {
                // Functions will take string arguments from the expressions, not the values.
                if e != nil && e.Type() != parser.ValueTypeString {
                        // ev.currentSamples will be updated to the correct value within the ev.eval call.
                        val, ws := ev.eval(e)
                        warnings.Merge(ws)
                        matrixes[i] = val.(Matrix)

                        // Keep a copy of the original point slices so that they
                        // can be returned to the pool.
                        origMatrixes[i] = make(Matrix, len(matrixes[i]))
                        copy(origMatrixes[i], matrixes[i])
                }
        }

        vectors := make([]Vector, len(exprs))    // Input vectors for the function.
        args := make([]parser.Value, len(exprs)) // Argument to function.
        // Create an output vector that is as big as the input matrix with
        // the most time series.
        biggestLen := 1
        for i := range exprs {
                vectors[i] = make(Vector, 0, len(matrixes[i]))
                if len(matrixes[i]) > biggestLen {
                        biggestLen = len(matrixes[i])
                }
        }
        enh := &EvalNodeHelper{Out: make(Vector, 0, biggestLen)}
        // seriesAndTimestamp pairs an output series with the timestamp of the
        // last step that added a sample to it, which is used to detect
        // duplicate labelsets within one step.
        type seriesAndTimestamp struct {
                Series
                ts int64
        }
        seriess := make(map[uint64]seriesAndTimestamp, biggestLen) // Output series by series hash.
        // Number of samples that stay in memory across steps: the evaluated
        // inputs plus the results accumulated so far.
        tempNumSamples := ev.currentSamples

        var (
                seriesHelpers [][]EvalSeriesHelper
                bufHelpers    [][]EvalSeriesHelper // Buffer updated on each step
        )

        // If the series preparation function is provided, we should run it for
        // every single series in the matrix.
        if prepSeries != nil {
                seriesHelpers = make([][]EvalSeriesHelper, len(exprs))
                bufHelpers = make([][]EvalSeriesHelper, len(exprs))

                for i := range exprs {
                        seriesHelpers[i] = make([]EvalSeriesHelper, len(matrixes[i]))
                        bufHelpers[i] = make([]EvalSeriesHelper, len(matrixes[i]))

                        for si, series := range matrixes[i] {
                                prepSeries(series.Metric, &seriesHelpers[i][si])
                        }
                }
        }

        for ts := ev.startTimestamp; ts <= ev.endTimestamp; ts += ev.interval {
                if err := contextDone(ev.ctx, "expression evaluation"); err != nil {
                        ev.error(err)
                }
                // Reset number of samples in memory after each timestamp.
                ev.currentSamples = tempNumSamples
                // Gather input vectors for this timestamp.
                for i := range exprs {
                        vectors[i] = vectors[i][:0]

                        if prepSeries != nil {
                                bufHelpers[i] = bufHelpers[i][:0]
                        }

                        for si, series := range matrixes[i] {
                                // Only the first remaining point of each series is
                                // looked at; series without a point at ts are skipped.
                                switch {
                                case len(series.Floats) > 0 && series.Floats[0].T == ts:
                                        vectors[i] = append(vectors[i], Sample{Metric: series.Metric, F: series.Floats[0].F, T: ts})
                                        // Move input vectors forward so we don't have to re-scan the same
                                        // past points at the next step.
                                        matrixes[i][si].Floats = series.Floats[1:]
                                case len(series.Histograms) > 0 && series.Histograms[0].T == ts:
                                        vectors[i] = append(vectors[i], Sample{Metric: series.Metric, H: series.Histograms[0].H, T: ts})
                                        matrixes[i][si].Histograms = series.Histograms[1:]
                                default:
                                        continue
                                }
                                // Keep the helpers aligned with the samples in vectors[i].
                                if prepSeries != nil {
                                        bufHelpers[i] = append(bufHelpers[i], seriesHelpers[i][si])
                                }
                                // Don't add histogram size here because we only
                                // copy the pointer above, not the whole
                                // histogram.
                                ev.currentSamples++
                                if ev.currentSamples > ev.maxSamples {
                                        ev.error(ErrTooManySamples(env))
                                }
                        }
                        args[i] = vectors[i]
                        ev.samplesStats.UpdatePeak(ev.currentSamples)
                }

                // Make the function call.
                enh.Ts = ts
                result, ws := funcCall(args, bufHelpers, enh)
                enh.Out = result[:0] // Reuse result vector.
                warnings.Merge(ws)

                vecNumSamples := result.TotalSamples()
                ev.currentSamples += vecNumSamples
                // When we reset currentSamples to tempNumSamples during the next iteration of the loop it also
                // needs to include the samples from the result here, as they're still in memory.
                tempNumSamples += vecNumSamples
                ev.samplesStats.UpdatePeak(ev.currentSamples)

                if ev.currentSamples > ev.maxSamples {
                        ev.error(ErrTooManySamples(env))
                }

                // If this could be an instant query, shortcut so as not to change sort order.
                if ev.endTimestamp == ev.startTimestamp {
                        if result.ContainsSameLabelset() {
                                ev.errorf("vector cannot contain metrics with the same labelset")
                        }
                        mat := make(Matrix, len(result))
                        for i, s := range result {
                                if s.H == nil {
                                        mat[i] = Series{Metric: s.Metric, Floats: []FPoint{{T: ts, F: s.F}}}
                                } else {
                                        mat[i] = Series{Metric: s.Metric, Histograms: []HPoint{{T: ts, H: s.H}}}
                                }
                        }
                        ev.currentSamples = originalNumSamples + mat.TotalSamples()
                        ev.samplesStats.UpdatePeak(ev.currentSamples)
                        return mat, warnings
                }

                // Add samples in output vector to output series.
                for _, sample := range result {
                        h := sample.Metric.Hash()
                        ss, ok := seriess[h]
                        if ok {
                                if ss.ts == ts { // If we've seen this output series before at this timestamp, it's a duplicate.
                                        ev.errorf("vector cannot contain metrics with the same labelset")
                                }
                                ss.ts = ts
                        } else {
                                ss = seriesAndTimestamp{Series{Metric: sample.Metric}, ts}
                        }
                        // NOTE(review): numSteps is presumably used by addToSeries as a
                        // capacity hint for the point slices — confirm in addToSeries.
                        addToSeries(&ss.Series, enh.Ts, sample.F, sample.H, numSteps)
                        // ss is a copy of the map value, so it has to be stored back.
                        seriess[h] = ss
                }
        }

        // Reuse the original point slices.
        for _, m := range origMatrixes {
                for _, s := range m {
                        putFPointSlice(s.Floats)
                        putHPointSlice(s.Histograms)
                }
        }
        // Assemble the output matrix. By the time we get here we know we don't have too many samples.
        mat := make(Matrix, 0, len(seriess))
        for _, ss := range seriess {
                mat = append(mat, ss.Series)
        }
        ev.currentSamples = originalNumSamples + mat.TotalSamples()
        ev.samplesStats.UpdatePeak(ev.currentSamples)
        return mat, warnings
}

// rangeEvalAgg evaluates the aggregation aggExpr over inputMatrix for every
// step between ev.startTimestamp and ev.endTimestamp (inclusive) and returns
// the resulting matrix together with any annotations collected on the way.
//
// sortedGrouping holds the grouping labels, which generateGroupingKey expects
// to be sorted. param is the number k for topk/bottomk, or q for quantile.
// The point slices of inputMatrix are returned to the pool when the function
// exits, so callers must not use them afterwards.
func (ev *evaluator) rangeEvalAgg(aggExpr *parser.AggregateExpr, sortedGrouping []string, inputMatrix Matrix, param float64) (Matrix, annotations.Annotations) {
        // Snapshot the input series headers so that their point slices can be
        // handed back to the pool once evaluation is over.
        pooled := slices.Clone(inputMatrix)
        defer func() {
                for i := range pooled {
                        putFPointSlice(pooled[i].Floats)
                        putHPointSlice(pooled[i].Histograms)
                }
        }()

        var (
                warnings annotations.Annotations
                result   Matrix
        )
        enh := &EvalNodeHelper{}
        baseSamples := ev.currentSamples
        // topk/bottomk produce their output through seriess instead of result.
        isKAgg := aggExpr.Op == parser.TOPK || aggExpr.Op == parser.BOTTOMK

        // Map every input series to the index of the output group it belongs to.
        // A new group always receives the next free index, which equals the
        // number of groups registered so far.
        keyBuf := make([]byte, 0, 1024)
        groupIndexByKey := make(map[uint64]int)
        seriesToResult := make([]int, len(inputMatrix))
        for si, series := range inputMatrix {
                var key uint64
                key, keyBuf = generateGroupingKey(series.Metric, sortedGrouping, aggExpr.Without, keyBuf)
                idx, seen := groupIndexByKey[key]
                if !seen {
                        if !isKAgg {
                                groupLabels := generateGroupingLabels(enh, series.Metric, aggExpr.Without, sortedGrouping)
                                result = append(result, Series{Metric: groupLabels})
                        }
                        idx = len(groupIndexByKey)
                        groupIndexByKey[key] = idx
                }
                seriesToResult[si] = idx
        }
        groups := make([]groupedAggregation, len(groupIndexByKey))

        // Validate the aggregation parameter where the operator has one.
        var (
                k       int
                seriess map[uint64]Series
        )
        if isKAgg {
                if !convertibleToInt64(param) {
                        ev.errorf("Scalar value %v overflows int64", param)
                }
                k = int(param)
                if k > len(inputMatrix) {
                        k = len(inputMatrix)
                }
                if k < 1 {
                        return nil, warnings
                }
                // Output series, keyed by series hash.
                seriess = make(map[uint64]Series, len(inputMatrix))
        } else if aggExpr.Op == parser.QUANTILE {
                if math.IsNaN(param) || param < 0 || param > 1 {
                        warnings.Add(annotations.NewInvalidQuantileWarning(param, aggExpr.Param.PositionRange()))
                }
        }

        for ts := ev.startTimestamp; ts <= ev.endTimestamp; ts += ev.interval {
                if err := contextDone(ev.ctx, "expression evaluation"); err != nil {
                        ev.error(err)
                }
                // Every step starts from the sample count seen on entry.
                ev.currentSamples = baseSamples
                enh.Ts = ts

                if isKAgg {
                        var ws annotations.Annotations
                        result, ws = ev.aggregationK(aggExpr, k, inputMatrix, seriesToResult, groups, enh, seriess)
                        // A single-step evaluation could be an instant query: return
                        // right away so that the sort order is left untouched.
                        if ev.endTimestamp == ev.startTimestamp {
                                return result, ws
                        }
                        warnings.Merge(ws)
                } else {
                        warnings.Merge(ev.aggregation(aggExpr, param, inputMatrix, result, seriesToResult, groups, enh))
                }

                if ev.currentSamples > ev.maxSamples {
                        ev.error(ErrTooManySamples(env))
                }
        }

        // Build the final matrix. The sample limit has already been enforced
        // for every step above.
        if isKAgg {
                result = make(Matrix, 0, len(seriess))
                for _, s := range seriess {
                        result = append(result, s)
                }
                return result, warnings
        }

        // Compact result in place, dropping groups that never received a point.
        kept := result[:0]
        for _, s := range result {
                if len(s.Floats) == 0 && len(s.Histograms) == 0 {
                        continue
                }
                kept = append(kept, s)
        }
        return kept, warnings
}

// evalSubquery evaluates subq and wraps the outcome in a MatrixSelector that
// can stand in for the subquery expression. Besides the selector it returns
// the total number of samples in the evaluated matrix and the annotations
// produced by the evaluation. The Name and LabelMatchers of the embedded
// VectorSelector are left unset.
func (ev *evaluator) evalSubquery(subq *parser.SubqueryExpr) (*parser.MatrixSelector, int, annotations.Annotations) {
        // Evaluate against a child stats object so that the subquery samples are
        // not counted twice; they are accounted for at a later stage.
        parentStats := ev.samplesStats
        ev.samplesStats = parentStats.NewChild()
        val, ws := ev.eval(subq)
        // The peak reached inside the subquery still has to be reflected in the
        // parent before the parent stats are put back in place.
        parentStats.UpdatePeakFromSubquery(ev.samplesStats)
        ev.samplesStats = parentStats

        mat := val.(Matrix)

        // With an @ modifier the subquery offset is not adjusted elsewhere, so
        // the effective offset of the result is derived here.
        offset := subq.Offset
        if subq.Timestamp != nil {
                offset = subq.OriginalOffset + time.Duration(ev.startTimestamp-*subq.Timestamp)*time.Millisecond
        }

        series := make([]storage.Series, len(mat))
        for i := range mat {
                series[i] = NewStorageSeries(mat[i])
        }

        return &parser.MatrixSelector{
                Range: subq.Range,
                VectorSelector: &parser.VectorSelector{
                        OriginalOffset: subq.OriginalOffset,
                        Offset:         offset,
                        Series:         series,
                        Timestamp:      subq.Timestamp,
                },
        }, mat.TotalSamples(), ws
}

// eval evaluates the given expression as the given AST expression node requires.
func (ev *evaluator) eval(expr parser.Expr) (parser.Value, annotations.Annotations) {
        // This is the top-level evaluation method.
        // Thus, we check for timeout/cancellation here.
        if err := contextDone(ev.ctx, "expression evaluation"); err != nil {
                ev.error(err)
        }
        numSteps := int((ev.endTimestamp-ev.startTimestamp)/ev.interval) + 1

        // Create a new span to help investigate inner evaluation performances.
        ctxWithSpan, span := otel.Tracer("").Start(ev.ctx, stats.InnerEvalTime.SpanOperation()+" eval "+reflect.TypeOf(expr).String())
        ev.ctx = ctxWithSpan
        defer span.End()

        switch e := expr.(type) {
        case *parser.AggregateExpr:
                // Grouping labels must be sorted (expected both by generateGroupingKey() and aggregation()).
                sortedGrouping := e.Grouping
                slices.Sort(sortedGrouping)

                unwrapParenExpr(&e.Param)
                param := unwrapStepInvariantExpr(e.Param)
                unwrapParenExpr(&param)

                if e.Op == parser.COUNT_VALUES {
                        valueLabel := param.(*parser.StringLiteral)
                        if !model.LabelName(valueLabel.Val).IsValid() {
                                ev.errorf("invalid label name %q", valueLabel)
                        }
                        if !e.Without {
                                sortedGrouping = append(sortedGrouping, valueLabel.Val)
                                slices.Sort(sortedGrouping)
                        }
                        return ev.rangeEval(nil, func(v []parser.Value, _ [][]EvalSeriesHelper, enh *EvalNodeHelper) (Vector, annotations.Annotations) {
                                return ev.aggregationCountValues(e, sortedGrouping, valueLabel.Val, v[0].(Vector), enh)
                        }, e.Expr)
                }

                var warnings annotations.Annotations
                originalNumSamples := ev.currentSamples
                // param is the number k for topk/bottomk, or q for quantile.
                var fParam float64
                if param != nil {
                        val, ws := ev.eval(param)
                        warnings.Merge(ws)
                        fParam = val.(Matrix)[0].Floats[0].F
                }
                // Now fetch the data to be aggregated.
                val, ws := ev.eval(e.Expr)
                warnings.Merge(ws)
                inputMatrix := val.(Matrix)

                result, ws := ev.rangeEvalAgg(e, sortedGrouping, inputMatrix, fParam)
                warnings.Merge(ws)
                ev.currentSamples = originalNumSamples + result.TotalSamples()
                ev.samplesStats.UpdatePeak(ev.currentSamples)

                return result, warnings

        case *parser.Call:
                call := FunctionCalls[e.Func.Name]
                if e.Func.Name == "timestamp" {
                        // Matrix evaluation always returns the evaluation time,
                        // so this function needs special handling when given
                        // a vector selector.
                        unwrapParenExpr(&e.Args[0])
                        arg := unwrapStepInvariantExpr(e.Args[0])
                        unwrapParenExpr(&arg)
                        vs, ok := arg.(*parser.VectorSelector)
                        if ok {
                                return ev.rangeEvalTimestampFunctionOverVectorSelector(vs, call, e)
                        }
                }

                // Check if the function has a matrix argument.
                var (
                        matrixArgIndex int
                        matrixArg      bool
                        warnings       annotations.Annotations
                )
                for i := range e.Args {
                        unwrapParenExpr(&e.Args[i])
                        a := unwrapStepInvariantExpr(e.Args[i])
                        unwrapParenExpr(&a)
                        if _, ok := a.(*parser.MatrixSelector); ok {
                                matrixArgIndex = i
                                matrixArg = true
                                break
                        }
                        // parser.SubqueryExpr can be used in place of parser.MatrixSelector.
                        if subq, ok := a.(*parser.SubqueryExpr); ok {
                                matrixArgIndex = i
                                matrixArg = true
                                // Replacing parser.SubqueryExpr with parser.MatrixSelector.
                                val, totalSamples, ws := ev.evalSubquery(subq)
                                e.Args[i] = val
                                warnings.Merge(ws)
                                defer func() {
                                        // subquery result takes space in the memory. Get rid of that at the end.
                                        val.VectorSelector.(*parser.VectorSelector).Series = nil
                                        ev.currentSamples -= totalSamples
                                }()
                                break
                        }
                }

                // Special handling for functions that work on series not samples.
                switch e.Func.Name {
                case "label_replace":
                        return ev.evalLabelReplace(e.Args)
                case "label_join":
                        return ev.evalLabelJoin(e.Args)
                }

                if !matrixArg {
                        // Does not have a matrix argument.
                        return ev.rangeEval(nil, func(v []parser.Value, _ [][]EvalSeriesHelper, enh *EvalNodeHelper) (Vector, annotations.Annotations) {
                                vec, annos := call(v, e.Args, enh)
                                return vec, warnings.Merge(annos)
                        }, e.Args...)
                }

                inArgs := make([]parser.Value, len(e.Args))
                // Evaluate any non-matrix arguments.
                otherArgs := make([]Matrix, len(e.Args))
                otherInArgs := make([]Vector, len(e.Args))
                for i, e := range e.Args {
                        if i != matrixArgIndex {
                                val, ws := ev.eval(e)
                                otherArgs[i] = val.(Matrix)
                                otherInArgs[i] = Vector{Sample{}}
                                inArgs[i] = otherInArgs[i]
                                warnings.Merge(ws)
                        }
                }

                unwrapParenExpr(&e.Args[matrixArgIndex])
                arg := unwrapStepInvariantExpr(e.Args[matrixArgIndex])
                unwrapParenExpr(&arg)
                sel := arg.(*parser.MatrixSelector)
                selVS := sel.VectorSelector.(*parser.VectorSelector)

                ws, err := checkAndExpandSeriesSet(ev.ctx, sel)
                warnings.Merge(ws)
                if err != nil {
                        ev.error(errWithWarnings{fmt.Errorf("expanding series: %w", err), warnings})
                }
                mat := make(Matrix, 0, len(selVS.Series)) // Output matrix.
                offset := durationMilliseconds(selVS.Offset)
                selRange := durationMilliseconds(sel.Range)
                stepRange := selRange
                if stepRange > ev.interval {
                        stepRange = ev.interval
                }
                // Reuse objects across steps to save memory allocations.
                var floats []FPoint
                var histograms []HPoint
                var prevSS *Series
                inMatrix := make(Matrix, 1)
                inArgs[matrixArgIndex] = inMatrix
                enh := &EvalNodeHelper{Out: make(Vector, 0, 1)}
                // Process all the calls for one time series at a time.
                it := storage.NewBuffer(selRange)
                var chkIter chunkenc.Iterator
                for i, s := range selVS.Series {
                        if err := contextDone(ev.ctx, "expression evaluation"); err != nil {
                                ev.error(err)
                        }
                        ev.currentSamples -= len(floats) + totalHPointSize(histograms)
                        if floats != nil {
                                floats = floats[:0]
                        }
                        if histograms != nil {
                                histograms = histograms[:0]
                        }
                        chkIter = s.Iterator(chkIter)
                        it.Reset(chkIter)
                        metric := selVS.Series[i].Labels()
                        // The last_over_time function acts like offset; thus, it
                        // should keep the metric name.  For all the other range
                        // vector functions, the only change needed is to drop the
                        // metric name in the output.
                        if e.Func.Name != "last_over_time" {
                                metric = metric.DropMetricName()
                        }
                        ss := Series{
                                Metric: metric,
                        }
                        inMatrix[0].Metric = selVS.Series[i].Labels()
                        for ts, step := ev.startTimestamp, -1; ts <= ev.endTimestamp; ts += ev.interval {
                                step++
                                // Set the non-matrix arguments.
                                // They are scalar, so it is safe to use the step number
                                // when looking up the argument, as there will be no gaps.
                                for j := range e.Args {
                                        if j != matrixArgIndex {
                                                otherInArgs[j][0].F = otherArgs[j][0].Floats[step].F
                                        }
                                }
                                // Evaluate the matrix selector for this series
                                // for this step, but only if this is the 1st
                                // iteration or no @ modifier has been used.
                                if ts == ev.startTimestamp || selVS.Timestamp == nil {
                                        maxt := ts - offset
                                        mint := maxt - selRange
                                        floats, histograms = ev.matrixIterSlice(it, mint, maxt, floats, histograms)
                                }
                                if len(floats)+len(histograms) == 0 {
                                        continue
                                }
                                inMatrix[0].Floats = floats
                                inMatrix[0].Histograms = histograms
                                enh.Ts = ts
                                // Make the function call.
                                outVec, annos := call(inArgs, e.Args, enh)
                                warnings.Merge(annos)
                                ev.samplesStats.IncrementSamplesAtStep(step, int64(len(floats)+totalHPointSize(histograms)))

                                enh.Out = outVec[:0]
                                if len(outVec) > 0 {
                                        if outVec[0].H == nil {
                                                if ss.Floats == nil {
                                                        ss.Floats = reuseOrGetFPointSlices(prevSS, numSteps)
                                                }

                                                ss.Floats = append(ss.Floats, FPoint{F: outVec[0].F, T: ts})
                                        } else {
                                                if ss.Histograms == nil {
                                                        ss.Histograms = reuseOrGetHPointSlices(prevSS, numSteps)
                                                }
                                                ss.Histograms = append(ss.Histograms, HPoint{H: outVec[0].H, T: ts})
                                        }
                                }
                                // Only buffer stepRange milliseconds from the second step on.
                                it.ReduceDelta(stepRange)
                        }
                        histSamples := totalHPointSize(ss.Histograms)

                        if len(ss.Floats)+histSamples > 0 {
                                if ev.currentSamples+len(ss.Floats)+histSamples > ev.maxSamples {
                                        ev.error(ErrTooManySamples(env))
                                }
                                mat = append(mat, ss)
                                prevSS = &mat[len(mat)-1]
                                ev.currentSamples += len(ss.Floats) + histSamples
                        }
                        ev.samplesStats.UpdatePeak(ev.currentSamples)

                        if e.Func.Name == "rate" || e.Func.Name == "increase" {
                                samples := inMatrix[0]
                                metricName := samples.Metric.Get(labels.MetricName)
                                if metricName != "" && len(samples.Floats) > 0 &&
                                        !strings.HasSuffix(metricName, "_total") &&
                                        !strings.HasSuffix(metricName, "_sum") &&
                                        !strings.HasSuffix(metricName, "_count") &&
                                        !strings.HasSuffix(metricName, "_bucket") {
                                        warnings.Add(annotations.NewPossibleNonCounterInfo(metricName, e.Args[0].PositionRange()))
                                }
                        }
                }
                ev.samplesStats.UpdatePeak(ev.currentSamples)

                ev.currentSamples -= len(floats) + totalHPointSize(histograms)
                putFPointSlice(floats)
                putMatrixSelectorHPointSlice(histograms)

                // The absent_over_time function returns 0 or 1 series. So far, the matrix
                // contains multiple series. The following code will create a new series
                // with values of 1 for the timestamps where no series has value.
                if e.Func.Name == "absent_over_time" {
                        steps := int(1 + (ev.endTimestamp-ev.startTimestamp)/ev.interval)
                        // Iterate once to look for a complete series.
                        for _, s := range mat {
                                if len(s.Floats)+len(s.Histograms) == steps {
                                        return Matrix{}, warnings
                                }
                        }

                        found := map[int64]struct{}{}

                        for i, s := range mat {
                                for _, p := range s.Floats {
                                        found[p.T] = struct{}{}
                                }
                                for _, p := range s.Histograms {
                                        found[p.T] = struct{}{}
                                }
                                if i > 0 && len(found) == steps {
                                        return Matrix{}, warnings
                                }
                        }

                        newp := make([]FPoint, 0, steps-len(found))
                        for ts := ev.startTimestamp; ts <= ev.endTimestamp; ts += ev.interval {
                                if _, ok := found[ts]; !ok {
                                        newp = append(newp, FPoint{T: ts, F: 1})
                                }
                        }

                        return Matrix{
                                Series{
                                        Metric: createLabelsForAbsentFunction(e.Args[0]),
                                        Floats: newp,
                                },
                        }, warnings
                }

                if mat.ContainsSameLabelset() {
                        ev.errorf("vector cannot contain metrics with the same labelset")
                }

                return mat, warnings

        case *parser.ParenExpr:
                return ev.eval(e.Expr)

        case *parser.UnaryExpr:
                val, ws := ev.eval(e.Expr)
                mat := val.(Matrix)
                if e.Op == parser.SUB {
                        for i := range mat {
                                mat[i].Metric = mat[i].Metric.DropMetricName()
                                for j := range mat[i].Floats {
                                        mat[i].Floats[j].F = -mat[i].Floats[j].F
                                }
                        }
                        if mat.ContainsSameLabelset() {
                                ev.errorf("vector cannot contain metrics with the same labelset")
                        }
                }
                return mat, ws

        case *parser.BinaryExpr:
                switch lt, rt := e.LHS.Type(), e.RHS.Type(); {
                case lt == parser.ValueTypeScalar && rt == parser.ValueTypeScalar:
                        return ev.rangeEval(nil, func(v []parser.Value, _ [][]EvalSeriesHelper, enh *EvalNodeHelper) (Vector, annotations.Annotations) {
                                val := scalarBinop(e.Op, v[0].(Vector)[0].F, v[1].(Vector)[0].F)
                                return append(enh.Out, Sample{F: val}), nil
                        }, e.LHS, e.RHS)
                case lt == parser.ValueTypeVector && rt == parser.ValueTypeVector:
                        // Function to compute the join signature for each series.
                        buf := make([]byte, 0, 1024)
                        sigf := signatureFunc(e.VectorMatching.On, buf, e.VectorMatching.MatchingLabels...)
                        initSignatures := func(series labels.Labels, h *EvalSeriesHelper) {
                                h.signature = sigf(series)
                        }
                        switch e.Op {
                        case parser.LAND:
                                return ev.rangeEval(initSignatures, func(v []parser.Value, sh [][]EvalSeriesHelper, enh *EvalNodeHelper) (Vector, annotations.Annotations) {
                                        return ev.VectorAnd(v[0].(Vector), v[1].(Vector), e.VectorMatching, sh[0], sh[1], enh), nil
                                }, e.LHS, e.RHS)
                        case parser.LOR:
                                return ev.rangeEval(initSignatures, func(v []parser.Value, sh [][]EvalSeriesHelper, enh *EvalNodeHelper) (Vector, annotations.Annotations) {
                                        return ev.VectorOr(v[0].(Vector), v[1].(Vector), e.VectorMatching, sh[0], sh[1], enh), nil
                                }, e.LHS, e.RHS)
                        case parser.LUNLESS:
                                return ev.rangeEval(initSignatures, func(v []parser.Value, sh [][]EvalSeriesHelper, enh *EvalNodeHelper) (Vector, annotations.Annotations) {
                                        return ev.VectorUnless(v[0].(Vector), v[1].(Vector), e.VectorMatching, sh[0], sh[1], enh), nil
                                }, e.LHS, e.RHS)
                        default:
                                return ev.rangeEval(initSignatures, func(v []parser.Value, sh [][]EvalSeriesHelper, enh *EvalNodeHelper) (Vector, annotations.Annotations) {
                                        vec, err := ev.VectorBinop(e.Op, v[0].(Vector), v[1].(Vector), e.VectorMatching, e.ReturnBool, sh[0], sh[1], enh)
                                        return vec, handleVectorBinopError(err, e)
                                }, e.LHS, e.RHS)
                        }

                case lt == parser.ValueTypeVector && rt == parser.ValueTypeScalar:
                        return ev.rangeEval(nil, func(v []parser.Value, _ [][]EvalSeriesHelper, enh *EvalNodeHelper) (Vector, annotations.Annotations) {
                                vec, err := ev.VectorscalarBinop(e.Op, v[0].(Vector), Scalar{V: v[1].(Vector)[0].F}, false, e.ReturnBool, enh)
                                return vec, handleVectorBinopError(err, e)
                        }, e.LHS, e.RHS)

                case lt == parser.ValueTypeScalar && rt == parser.ValueTypeVector:
                        return ev.rangeEval(nil, func(v []parser.Value, _ [][]EvalSeriesHelper, enh *EvalNodeHelper) (Vector, annotations.Annotations) {
                                vec, err := ev.VectorscalarBinop(e.Op, v[1].(Vector), Scalar{V: v[0].(Vector)[0].F}, true, e.ReturnBool, enh)
                                return vec, handleVectorBinopError(err, e)
                        }, e.LHS, e.RHS)
                }

        case *parser.NumberLiteral:
                return ev.rangeEval(nil, func(v []parser.Value, _ [][]EvalSeriesHelper, enh *EvalNodeHelper) (Vector, annotations.Annotations) {
                        return append(enh.Out, Sample{F: e.Val, Metric: labels.EmptyLabels()}), nil
                })

        case *parser.StringLiteral:
                return String{V: e.Val, T: ev.startTimestamp}, nil

        case *parser.VectorSelector:
                ws, err := checkAndExpandSeriesSet(ev.ctx, e)
                if err != nil {
                        ev.error(errWithWarnings{fmt.Errorf("expanding series: %w", err), ws})
                }
                mat := make(Matrix, 0, len(e.Series))
                var prevSS *Series
                it := storage.NewMemoizedEmptyIterator(durationMilliseconds(ev.lookbackDelta))
                var chkIter chunkenc.Iterator
                for i, s := range e.Series {
                        if err := contextDone(ev.ctx, "expression evaluation"); err != nil {
                                ev.error(err)
                        }
                        chkIter = s.Iterator(chkIter)
                        it.Reset(chkIter)
                        ss := Series{
                                Metric: e.Series[i].Labels(),
                        }

                        for ts, step := ev.startTimestamp, -1; ts <= ev.endTimestamp; ts += ev.interval {
                                step++
                                _, f, h, ok := ev.vectorSelectorSingle(it, e, ts)
                                if ok {
                                        if h == nil {
                                                ev.currentSamples++
                                                ev.samplesStats.IncrementSamplesAtStep(step, 1)
                                                if ev.currentSamples > ev.maxSamples {
                                                        ev.error(ErrTooManySamples(env))
                                                }
                                                if ss.Floats == nil {
                                                        ss.Floats = reuseOrGetFPointSlices(prevSS, numSteps)
                                                }
                                                ss.Floats = append(ss.Floats, FPoint{F: f, T: ts})
                                        } else {
                                                point := HPoint{H: h, T: ts}
                                                histSize := point.size()
                                                ev.currentSamples += histSize
                                                ev.samplesStats.IncrementSamplesAtStep(step, int64(histSize))
                                                if ev.currentSamples > ev.maxSamples {
                                                        ev.error(ErrTooManySamples(env))
                                                }
                                                if ss.Histograms == nil {
                                                        ss.Histograms = reuseOrGetHPointSlices(prevSS, numSteps)
                                                }
                                                ss.Histograms = append(ss.Histograms, point)
                                        }
                                }
                        }

                        if len(ss.Floats)+len(ss.Histograms) > 0 {
                                mat = append(mat, ss)
                                prevSS = &mat[len(mat)-1]
                        }
                }
                ev.samplesStats.UpdatePeak(ev.currentSamples)
                return mat, ws

        case *parser.MatrixSelector:
                if ev.startTimestamp != ev.endTimestamp {
                        panic(errors.New("cannot do range evaluation of matrix selector"))
                }
                return ev.matrixSelector(e)

        case *parser.SubqueryExpr:
                offsetMillis := durationMilliseconds(e.Offset)
                rangeMillis := durationMilliseconds(e.Range)
                newEv := &evaluator{
                        endTimestamp:             ev.endTimestamp - offsetMillis,
                        ctx:                      ev.ctx,
                        currentSamples:           ev.currentSamples,
                        maxSamples:               ev.maxSamples,
                        logger:                   ev.logger,
                        lookbackDelta:            ev.lookbackDelta,
                        samplesStats:             ev.samplesStats.NewChild(),
                        noStepSubqueryIntervalFn: ev.noStepSubqueryIntervalFn,
                }

                if e.Step != 0 {
                        newEv.interval = durationMilliseconds(e.Step)
                } else {
                        newEv.interval = ev.noStepSubqueryIntervalFn(rangeMillis)
                }

                // Start with the first timestamp after (ev.startTimestamp - offset - range)
                // that is aligned with the step (multiple of 'newEv.interval').
                newEv.startTimestamp = newEv.interval * ((ev.startTimestamp - offsetMillis - rangeMillis) / newEv.interval)
                if newEv.startTimestamp < (ev.startTimestamp - offsetMillis - rangeMillis) {
                        newEv.startTimestamp += newEv.interval
                }

                if newEv.startTimestamp != ev.startTimestamp {
                        // Adjust the offset of selectors based on the new
                        // start time of the evaluator since the calculation
                        // of the offset with @ happens w.r.t. the start time.
                        setOffsetForAtModifier(newEv.startTimestamp, e.Expr)
                }

                res, ws := newEv.eval(e.Expr)
                ev.currentSamples = newEv.currentSamples
                ev.samplesStats.UpdatePeakFromSubquery(newEv.samplesStats)
                ev.samplesStats.IncrementSamplesAtTimestamp(ev.endTimestamp, newEv.samplesStats.TotalSamples)
                return res, ws
        case *parser.StepInvariantExpr:
                switch ce := e.Expr.(type) {
                case *parser.StringLiteral, *parser.NumberLiteral:
                        return ev.eval(ce)
                }

                newEv := &evaluator{
                        startTimestamp:           ev.startTimestamp,
                        endTimestamp:             ev.startTimestamp, // Always a single evaluation.
                        interval:                 ev.interval,
                        ctx:                      ev.ctx,
                        currentSamples:           ev.currentSamples,
                        maxSamples:               ev.maxSamples,
                        logger:                   ev.logger,
                        lookbackDelta:            ev.lookbackDelta,
                        samplesStats:             ev.samplesStats.NewChild(),
                        noStepSubqueryIntervalFn: ev.noStepSubqueryIntervalFn,
                }
                res, ws := newEv.eval(e.Expr)
                ev.currentSamples = newEv.currentSamples
                ev.samplesStats.UpdatePeakFromSubquery(newEv.samplesStats)
                for ts, step := ev.startTimestamp, -1; ts <= ev.endTimestamp; ts += ev.interval {
                        step++
                        ev.samplesStats.IncrementSamplesAtStep(step, newEv.samplesStats.TotalSamples)
                }
                switch e.Expr.(type) {
                case *parser.MatrixSelector, *parser.SubqueryExpr:
                        // We do not duplicate results for range selectors since result is a matrix
                        // with their unique timestamps which does not depend on the step.
                        return res, ws
                }

                // For every evaluation while the value remains same, the timestamp for that
                // value would change for different eval times. Hence we duplicate the result
                // with changed timestamps.
                mat, ok := res.(Matrix)
                if !ok {
                        panic(fmt.Errorf("unexpected result in StepInvariantExpr evaluation: %T", expr))
                }
                for i := range mat {
                        if len(mat[i].Floats)+len(mat[i].Histograms) != 1 {
                                panic(fmt.Errorf("unexpected number of samples"))
                        }
                        for ts := ev.startTimestamp + ev.interval; ts <= ev.endTimestamp; ts += ev.interval {
                                if len(mat[i].Floats) > 0 {
                                        mat[i].Floats = append(mat[i].Floats, FPoint{
                                                T: ts,
                                                F: mat[i].Floats[0].F,
                                        })
                                        ev.currentSamples++
                                } else {
                                        point := HPoint{
                                                T: ts,
                                                H: mat[i].Histograms[0].H,
                                        }
                                        mat[i].Histograms = append(mat[i].Histograms, point)
                                        ev.currentSamples += point.size()
                                }
                                if ev.currentSamples > ev.maxSamples {
                                        ev.error(ErrTooManySamples(env))
                                }
                        }
                }
                ev.samplesStats.UpdatePeak(ev.currentSamples)
                return res, ws
        }

        panic(fmt.Errorf("unhandled expression of type: %T", expr))
}

// reuseOrGetHPointSlices returns an empty HPoint slice for a new series.
//
// If the previous series' histogram slice has more than twice as much capacity
// as it has points, the unused tail of its backing array is handed out instead
// of a fresh slice. The previous slice's capacity is then clamped to its
// length so it can never grow into the tail that was given away, including
// after it has been recycled through the pool. In every other case the slice
// comes from getHPointSlice.
func reuseOrGetHPointSlices(prevSS *Series, numSteps int) (r []HPoint) {
        if prevSS == nil {
                return getHPointSlice(numSteps)
        }

        used := len(prevSS.Histograms)
        if cap(prevSS.Histograms) <= 2*used {
                // Not enough spare room to be worth sharing the backing array.
                return getHPointSlice(numSteps)
        }

        r = prevSS.Histograms[used:]
        prevSS.Histograms = prevSS.Histograms[:used:used]
        return r
}

// reuseOrGetFPointSlices returns an empty FPoint slice for a new series.
//
// If the previous series' float slice has more than twice as much capacity as
// it has points, the unused tail of its backing array is handed out instead of
// a fresh slice. The previous slice's capacity is then clamped to its length
// so it can never grow into the tail that was given away, including after it
// has been recycled through the pool. In every other case the slice comes
// from getFPointSlice.
func reuseOrGetFPointSlices(prevSS *Series, numSteps int) (r []FPoint) {
        if prevSS == nil {
                return getFPointSlice(numSteps)
        }

        used := len(prevSS.Floats)
        if cap(prevSS.Floats) <= 2*used {
                // Not enough spare room to be worth sharing the backing array.
                return getFPointSlice(numSteps)
        }

        r = prevSS.Floats[used:]
        prevSS.Floats = prevSS.Floats[:used:used]
        return r
}

// rangeEvalTimestampFunctionOverVectorSelector evaluates call over the vector
// selector vs for every step of the range evaluation driven by ev.rangeEval.
//
// For each step it builds a Vector holding one Sample per series that has a
// point at that step, carrying only the series labels and the point's
// timestamp, and passes that Vector to call together with e.Args. Warnings
// from expanding the series set are merged into the annotations returned by
// call.
func (ev *evaluator) rangeEvalTimestampFunctionOverVectorSelector(vs *parser.VectorSelector, call FunctionCall, e *parser.Call) (parser.Value, annotations.Annotations) {
        ws, err := checkAndExpandSeriesSet(ev.ctx, vs)
        if err != nil {
                ev.error(errWithWarnings{fmt.Errorf("expanding series: %w", err), ws})
        }

        // One memoized iterator per series, created once and reused by every step.
        iters := make([]*storage.MemoizedSeriesIterator, 0, len(vs.Series))
        for _, series := range vs.Series {
                iters = append(iters, storage.NewMemoizedIterator(series.Iterator(nil), durationMilliseconds(ev.lookbackDelta)))
        }

        evalStep := func(v []parser.Value, _ [][]EvalSeriesHelper, enh *EvalNodeHelper) (Vector, annotations.Annotations) {
                if vs.Timestamp != nil {
                        // This is a special case for "timestamp()" when the @ modifier is used, to ensure that
                        // we return a point for each time step in this case.
                        // See https://github.com/prometheus/prometheus/issues/8433.
                        vs.Offset = time.Duration(enh.Ts-*vs.Timestamp) * time.Millisecond
                }

                vec := make(Vector, 0, len(vs.Series))
                for i := range vs.Series {
                        t, _, _, ok := ev.vectorSelectorSingle(iters[i], vs, enh.Ts)
                        if !ok {
                                continue
                        }

                        // Only the timestamp matters to call, so the sample value is left unset.
                        vec = append(vec, Sample{
                                Metric: vs.Series[i].Labels(),
                                T:      t,
                        })

                        ev.currentSamples++
                        ev.samplesStats.IncrementSamplesAtTimestamp(enh.Ts, 1)
                        if ev.currentSamples > ev.maxSamples {
                                ev.error(ErrTooManySamples(env))
                        }
                }
                ev.samplesStats.UpdatePeak(ev.currentSamples)

                out, annos := call([]parser.Value{vec}, e.Args, enh)
                return out, ws.Merge(annos)
        }

        return ev.rangeEval(nil, evalStep)
}

// vectorSelectorSingle evaluates an instant vector selector for the iterator
// of a single time series at evaluation time ts.
//
// The reference time is ts minus the selector's offset. The iterator is sought
// to the reference time; if nothing is found there, or the sample found lies
// after the reference time, the previous sample is used instead, provided it
// is not older than the reference time minus the lookback delta.
//
// It returns the sample's timestamp, its float value, its float histogram
// (nil for float samples) and true. The final result is false when no usable
// sample exists or when the selected sample is a staleness marker.
func (ev *evaluator) vectorSelectorSingle(it *storage.MemoizedSeriesIterator, node *parser.VectorSelector, ts int64) (
        int64, float64, *histogram.FloatHistogram, bool,
) {
        var (
                t int64
                v float64
                h *histogram.FloatHistogram
        )
        refTime := ts - durationMilliseconds(node.Offset)

        valueType := it.Seek(refTime)
        switch valueType {
        case chunkenc.ValNone:
                if err := it.Err(); err != nil {
                        ev.error(err)
                }
        case chunkenc.ValFloat:
                t, v = it.At()
        case chunkenc.ValFloatHistogram:
                t, h = it.AtFloatHistogram()
        default:
                panic(fmt.Errorf("unknown value type %v", valueType))
        }

        if valueType == chunkenc.ValNone || t > refTime {
                // Fall back to the sample just before the sought position.
                prevT, prevV, prevH, ok := it.PeekPrev()
                if !ok || prevT < refTime-durationMilliseconds(ev.lookbackDelta) {
                        return 0, 0, nil, false
                }
                t, v, h = prevT, prevV, prevH
        }

        stale := value.IsStaleNaN(v) || (h != nil && value.IsStaleNaN(h.Sum))
        if stale {
                return 0, 0, nil, false
        }
        return t, v, h, true
}

var (
        // fPointPool holds reusable []FPoint slices; it is read by getFPointSlice
        // and filled by putFPointSlice.
        fPointPool zeropool.Pool[[]FPoint]
        // hPointPool holds reusable []HPoint slices; it is read by getHPointSlice
        // and filled by putHPointSlice.
        hPointPool zeropool.Pool[[]HPoint]

        // matrixSelectorHPool holds reusable histogram slices used by the matrix
        // selector. The key difference between this pool and the hPointPool is that
        // slices returned by this pool should never hold multiple copies of the same
        // histogram pointer since histogram objects are reused across query evaluation
        // steps.
        matrixSelectorHPool zeropool.Pool[[]HPoint]
)

// getFPointSlice returns an FPoint slice, preferring one recycled from
// fPointPool. If the pool has none, a new empty slice is allocated with
// capacity sz capped at maxPointsSliceSize; sz is only an estimate and is
// often too large.
func getFPointSlice(sz int) []FPoint {
        pooled := fPointPool.Get()
        if pooled != nil {
                return pooled
        }

        capacity := sz
        if capacity > maxPointsSliceSize {
                capacity = maxPointsSliceSize
        }
        return make([]FPoint, 0, capacity)
}

// putFPointSlice hands p back to fPointPool for later reuse. The slice is
// truncated to zero length before it is stored, keeping its capacity. A nil
// slice is ignored.
func putFPointSlice(p []FPoint) {
        if p == nil {
                return
        }
        fPointPool.Put(p[:0])
}

// getHPointSlice returns an HPoint slice, preferring one recycled from
// hPointPool. If the pool has none, a new empty slice is allocated with
// capacity sz capped at maxPointsSliceSize; sz is only an estimate and is
// often too large.
func getHPointSlice(sz int) []HPoint {
        pooled := hPointPool.Get()
        if pooled != nil {
                return pooled
        }

        capacity := sz
        if capacity > maxPointsSliceSize {
                capacity = maxPointsSliceSize
        }
        return make([]HPoint, 0, capacity)
}

// putHPointSlice hands p back to hPointPool for later reuse. The slice is
// truncated to zero length before it is stored, keeping its capacity. A nil
// slice is ignored.
func putHPointSlice(p []HPoint) {
        if p == nil {
                return
        }
        hPointPool.Put(p[:0])
}

// getMatrixSelectorHPoints returns an HPoint slice for use by the matrix
// selector, preferring one recycled from matrixSelectorHPool. If the pool has
// none, a new empty slice with capacity matrixSelectorSliceSize is allocated.
func getMatrixSelectorHPoints() []HPoint {
        pooled := matrixSelectorHPool.Get()
        if pooled != nil {
                return pooled
        }
        return make([]HPoint, 0, matrixSelectorSliceSize)
}

// putMatrixSelectorHPointSlice hands p back to matrixSelectorHPool for later
// reuse. The slice is truncated to zero length before it is stored, keeping
// its capacity. A nil slice is ignored.
func putMatrixSelectorHPointSlice(p []HPoint) {
        if p == nil {
                return
        }
        matrixSelectorHPool.Put(p[:0])
}

// matrixSelector evaluates a *parser.MatrixSelector expression.
//
// The selected window ends at the evaluator's start timestamp minus the
// selector's offset and spans node.Range before that. For every series of the
// underlying vector selector, the points inside the window are collected with
// matrixIterSlice. Series that yield no points are left out of the result and
// their (empty) point slices are handed back to the pools they came from.
//
// It returns the resulting Matrix together with any warnings produced while
// expanding the series set. Errors are reported through ev.error.
func (ev *evaluator) matrixSelector(node *parser.MatrixSelector) (Matrix, annotations.Annotations) {
        var (
                vs = node.VectorSelector.(*parser.VectorSelector)

                offset = durationMilliseconds(vs.Offset)
                maxt   = ev.startTimestamp - offset
                mint   = maxt - durationMilliseconds(node.Range)
                matrix = make(Matrix, 0, len(vs.Series))

                it = storage.NewBuffer(durationMilliseconds(node.Range))
        )
        ws, err := checkAndExpandSeriesSet(ev.ctx, node)
        if err != nil {
                ev.error(errWithWarnings{fmt.Errorf("expanding series: %w", err), ws})
        }

        // chkIter is passed back into Iterator on every round so the underlying
        // chunk iterator can be reused across series.
        var chkIter chunkenc.Iterator
        for _, s := range vs.Series {
                if err := contextDone(ev.ctx, "expression evaluation"); err != nil {
                        ev.error(err)
                }
                chkIter = s.Iterator(chkIter)
                it.Reset(chkIter)
                ss := Series{
                        Metric: s.Labels(),
                }

                ss.Floats, ss.Histograms = ev.matrixIterSlice(it, mint, maxt, nil, nil)
                totalSize := int64(len(ss.Floats)) + int64(totalHPointSize(ss.Histograms))
                ev.samplesStats.IncrementSamplesAtTimestamp(ev.startTimestamp, totalSize)

                if totalSize > 0 {
                        matrix = append(matrix, ss)
                        continue
                }

                // Nothing was selected for this series; recycle the slices.
                // matrixIterSlice obtains histogram slices from the matrix selector
                // pool, so they must go back to that pool rather than to hPointPool.
                putFPointSlice(ss.Floats)
                putMatrixSelectorHPointSlice(ss.Histograms)
        }
        return matrix, ws
}

// matrixIterSlice populates a matrix vector covering the requested range for a
// single time series, with points retrieved from an iterator.
//
// As an optimization, the matrix vector may already contain points of the same
// time series from the evaluation of an earlier step (with lower mint and maxt
// values). Any such points falling before mint are discarded; points that fall
// into the [mint, maxt] range are retained; only points with later timestamps
// are populated from the iterator.
//
// The floats and histograms arguments are the slices from the previous step
// (or nil); the returned slices reuse their backing arrays where possible.
// Staleness markers are never added to the result. ev.currentSamples is
// adjusted for discarded and added points, and ev.error is called with
// ErrTooManySamples once it exceeds ev.maxSamples.
func (ev *evaluator) matrixIterSlice(
        it *storage.BufferedSeriesIterator, mint, maxt int64,
        floats []FPoint, histograms []HPoint,
) ([]FPoint, []HPoint) {
        // Lower timestamp bounds for new points; raised below when points from
        // the previous step are retained.
        mintFloats, mintHistograms := mint, mint

        // First floats...
        if len(floats) > 0 && floats[len(floats)-1].T >= mint {
                // There is an overlap between previous and current ranges, retain common
                // points. In most such cases:
                //   (a) the overlap is significantly larger than the eval step; and/or
                //   (b) the number of samples is relatively small.
                // so a linear search will be as fast as a binary search.
                var drop int
                for drop = 0; floats[drop].T < mint; drop++ {
                }
                ev.currentSamples -= drop
                // Shift the retained points to the front of the slice.
                copy(floats, floats[drop:])
                floats = floats[:len(floats)-drop]
                // Only append points with timestamps after the last timestamp we have.
                mintFloats = floats[len(floats)-1].T + 1
        } else {
                // No overlap: every previous point is discarded.
                ev.currentSamples -= len(floats)
                if floats != nil {
                        floats = floats[:0]
                }
        }

        // ...then the same for histograms. TODO(beorn7): Use generics?
        if len(histograms) > 0 && histograms[len(histograms)-1].T >= mint {
                // There is an overlap between previous and current ranges, retain common
                // points. In most such cases:
                //   (a) the overlap is significantly larger than the eval step; and/or
                //   (b) the number of samples is relatively small.
                // so a linear search will be as fast as a binary search.
                var drop int
                for drop = 0; histograms[drop].T < mint; drop++ {
                }
                // Rotate the buffer around the drop index so that points before mint can be
                // reused to store new histograms.
                tail := make([]HPoint, drop)
                copy(tail, histograms[:drop])
                copy(histograms, histograms[drop:])
                copy(histograms[len(histograms)-drop:], tail)
                histograms = histograms[:len(histograms)-drop]
                // NOTE(review): this subtracts the size of the retained points, whereas
                // the float branch above subtracts the number of dropped points.
                // Confirm this asymmetry is intended.
                ev.currentSamples -= totalHPointSize(histograms)
                // Only append points with timestamps after the last timestamp we have.
                mintHistograms = histograms[len(histograms)-1].T + 1
        } else {
                // No overlap: every previous point is discarded.
                ev.currentSamples -= totalHPointSize(histograms)
                if histograms != nil {
                        histograms = histograms[:0]
                }
        }

        // Remember what kind of sample (if any) the seek landed on; it is
        // examined after the buffer has been drained.
        soughtValueType := it.Seek(maxt)
        if soughtValueType == chunkenc.ValNone {
                if it.Err() != nil {
                        ev.error(it.Err())
                }
        }

        buf := it.Buffer()
loop:
        for {
                switch buf.Next() {
                case chunkenc.ValNone:
                        break loop
                case chunkenc.ValFloatHistogram, chunkenc.ValHistogram:
                        t := buf.AtT()
                        // Values in the buffer are guaranteed to be smaller than maxt.
                        if t >= mintHistograms {
                                if histograms == nil {
                                        histograms = getMatrixSelectorHPoints()
                                }
                                n := len(histograms)
                                // Extend into spare capacity when there is any, so the HPoint
                                // already stored at index n (and its H, if set) is reused;
                                // otherwise append a point with a fresh histogram.
                                if n < cap(histograms) {
                                        histograms = histograms[:n+1]
                                } else {
                                        histograms = append(histograms, HPoint{H: &histogram.FloatHistogram{}})
                                }
                                histograms[n].T, histograms[n].H = buf.AtFloatHistogram(histograms[n].H)
                                if value.IsStaleNaN(histograms[n].H.Sum) {
                                        // Staleness marker: undo the extension and skip it.
                                        histograms = histograms[:n]
                                        continue loop
                                }
                                ev.currentSamples += histograms[n].size()
                                if ev.currentSamples > ev.maxSamples {
                                        ev.error(ErrTooManySamples(env))
                                }
                        }
                case chunkenc.ValFloat:
                        t, f := buf.At()
                        if value.IsStaleNaN(f) {
                                continue loop
                        }
                        // Values in the buffer are guaranteed to be smaller than maxt.
                        if t >= mintFloats {
                                ev.currentSamples++
                                if ev.currentSamples > ev.maxSamples {
                                        ev.error(ErrTooManySamples(env))
                                }
                                if floats == nil {
                                        floats = getFPointSlice(16)
                                }
                                floats = append(floats, FPoint{T: t, F: f})
                        }
                }
        }
        // The sought sample might also be in the range.
        switch soughtValueType {
        case chunkenc.ValFloatHistogram, chunkenc.ValHistogram:
                // Only a sample exactly at maxt belongs to the range.
                if it.AtT() != maxt {
                        break
                }
                if histograms == nil {
                        histograms = getMatrixSelectorHPoints()
                }
                n := len(histograms)
                // Same slot-reuse scheme as in the buffer loop above.
                if n < cap(histograms) {
                        histograms = histograms[:n+1]
                } else {
                        histograms = append(histograms, HPoint{H: &histogram.FloatHistogram{}})
                }
                histograms[n].T, histograms[n].H = it.AtFloatHistogram(histograms[n].H)
                if value.IsStaleNaN(histograms[n].H.Sum) {
                        // Staleness marker: undo the extension and skip it.
                        histograms = histograms[:n]
                        break
                }
                ev.currentSamples += histograms[n].size()
                if ev.currentSamples > ev.maxSamples {
                        ev.error(ErrTooManySamples(env))
                }

        case chunkenc.ValFloat:
                t, f := it.At()
                // Only a non-stale sample exactly at maxt belongs to the range.
                if t == maxt && !value.IsStaleNaN(f) {
                        ev.currentSamples++
                        if ev.currentSamples > ev.maxSamples {
                                ev.error(ErrTooManySamples(env))
                        }
                        if floats == nil {
                                floats = getFPointSlice(16)
                        }
                        floats = append(floats, FPoint{T: t, F: f})
                }
        }
        ev.samplesStats.UpdatePeak(ev.currentSamples)
        return floats, histograms
}

// VectorAnd implements the "and" set operator: it appends to enh.Out every
// sample of lhs whose signature (from lhsh) also occurs among the signatures
// of the right-hand side (rhsh), and returns enh.Out. It returns nil when
// either side is empty and panics unless matching is many-to-many.
func (ev *evaluator) VectorAnd(lhs, rhs Vector, matching *parser.VectorMatching, lhsh, rhsh []EvalSeriesHelper, enh *EvalNodeHelper) Vector {
        if matching.Card != parser.CardManyToMany {
                panic("set operations must only use many-to-many matching")
        }
        if len(lhs) == 0 || len(rhs) == 0 {
                // AND with an empty side can never produce anything.
                return nil
        }

        // Collect the right-hand side signatures for constant-time membership tests.
        rightSigs := make(map[string]struct{})
        for i := range rhsh {
                rightSigs[rhsh[i].signature] = struct{}{}
        }

        for i := range lhs {
                if _, matched := rightSigs[lhsh[i].signature]; !matched {
                        continue
                }
                enh.Out = append(enh.Out, lhs[i])
        }
        return enh.Out
}

// VectorOr implements the "or" set operator: it appends to enh.Out all samples
// of lhs followed by those samples of rhs whose signature (from rhsh) does not
// occur among the left-hand side signatures (lhsh), and returns enh.Out. When
// one side is empty the other side is appended unchanged. It panics unless
// matching is many-to-many.
func (ev *evaluator) VectorOr(lhs, rhs Vector, matching *parser.VectorMatching, lhsh, rhsh []EvalSeriesHelper, enh *EvalNodeHelper) Vector {
        if matching.Card != parser.CardManyToMany {
                panic("set operations must only use many-to-many matching")
        }
        // Short-circuits: with one side empty the result is simply the other side.
        if len(lhs) == 0 {
                enh.Out = append(enh.Out, rhs...)
                return enh.Out
        }
        if len(rhs) == 0 {
                enh.Out = append(enh.Out, lhs...)
                return enh.Out
        }

        // Everything on the left is part of the result; remember its signatures.
        leftSigs := make(map[string]struct{})
        for i := range lhs {
                leftSigs[lhsh[i].signature] = struct{}{}
                enh.Out = append(enh.Out, lhs[i])
        }
        // From the right, only take what the left has not already contributed.
        for j := range rhs {
                if _, taken := leftSigs[rhsh[j].signature]; taken {
                        continue
                }
                enh.Out = append(enh.Out, rhs[j])
        }
        return enh.Out
}

// VectorUnless implements the "unless" set operator: it appends to enh.Out
// every sample of lhs whose signature (from lhsh) does not occur among the
// signatures of the right-hand side (rhsh), and returns enh.Out. It panics
// unless matching is many-to-many.
func (ev *evaluator) VectorUnless(lhs, rhs Vector, matching *parser.VectorMatching, lhsh, rhsh []EvalSeriesHelper, enh *EvalNodeHelper) Vector {
        if matching.Card != parser.CardManyToMany {
                panic("set operations must only use many-to-many matching")
        }
        // With an empty rhs all of lhs is kept; with an empty lhs nothing is
        // appended. Either way no signature map is needed.
        if len(lhs) == 0 || len(rhs) == 0 {
                enh.Out = append(enh.Out, lhs...)
                return enh.Out
        }

        rightSigs := make(map[string]struct{})
        for i := range rhsh {
                rightSigs[rhsh[i].signature] = struct{}{}
        }

        for i := range lhs {
                if _, excluded := rightSigs[lhsh[i].signature]; excluded {
                        continue
                }
                enh.Out = append(enh.Out, lhs[i])
        }
        return enh.Out
}

// VectorBinop evaluates a binary operation between two Vectors, excluding set operators.
//
// Samples are paired through the signatures in lhsh and rhsh. The code works
// in terms of a "many" side (lhs) and a "one" side (rhs); for one-to-many
// matching the sides are swapped up front and swapped back when the operand
// values are passed to vectorElemBinop. Duplicate signatures on the "one"
// side, and duplicate result elements, are reported through ev.errorf.
//
// Results are appended to enh.Out, which is returned together with the last
// error produced by vectorElemBinop, if any. With returnBool set, every match
// yields 1 or 0 and the metric name is dropped; otherwise pairs for which
// vectorElemBinop reports keep == false are skipped. It panics when matching
// is many-to-many.
func (ev *evaluator) VectorBinop(op parser.ItemType, lhs, rhs Vector, matching *parser.VectorMatching, returnBool bool, lhsh, rhsh []EvalSeriesHelper, enh *EvalNodeHelper) (Vector, error) {
        if matching.Card == parser.CardManyToMany {
                panic("many-to-many only allowed for set operators")
        }
        if len(lhs) == 0 || len(rhs) == 0 {
                return nil, nil // Short-circuit: nothing is going to match.
        }

        // Everything below is written for one-to-one or many-to-one matching.
        // One-to-many is handled by swapping the sides here and swapping the
        // operand values back before they are combined.
        swapped := matching.Card == parser.CardOneToMany
        if swapped {
                lhs, rhs = rhs, lhs
                lhsh, rhsh = rhsh, lhsh
        }

        // Reuse the helper's map of "one"-side samples keyed by signature,
        // emptying it if it already exists.
        if enh.rightSigs != nil {
                for sig := range enh.rightSigs {
                        delete(enh.rightSigs, sig)
                }
        } else {
                enh.rightSigs = make(map[string]Sample, len(enh.Out))
        }
        oneBySig := enh.rightSigs

        // Index the "one" side by signature so matches can be looked up later.
        for i := range rhs {
                one := rhs[i]
                sig := rhsh[i].signature
                // Two samples with the same signature on the "one" side would make
                // the matching many-to-many, which is not allowed.
                if prior, dup := oneBySig[sig]; dup {
                        // Name the side as the user wrote it, undoing the swap.
                        side := "right"
                        if swapped {
                                side = "left"
                        }
                        group := one.Metric.MatchLabels(matching.On, matching.MatchingLabels...)
                        ev.errorf("found duplicate series for the match group %s on the %s hand-side of the operation: [%s, %s]"+
                                ";many-to-many matching not allowed: matching labels must be unique on one side", group.String(), side, one.Metric.String(), prior.Metric.String())
                }
                oneBySig[sig] = one
        }

        // Records which signatures have already produced a result. For
        // one-to-one matching the value is nil; for many-to-one it is the set of
        // result metric hashes, used to detect duplicated result elements.
        if enh.matchedSigs != nil {
                for sig := range enh.matchedSigs {
                        delete(enh.matchedSigs, sig)
                }
        } else {
                enh.matchedSigs = make(map[string]map[uint64]struct{}, len(oneBySig))
        }
        matchedSigs := enh.matchedSigs

        // Pair every "many"-side sample with its "one"-side partner and apply
        // the operation.
        var lastErr error
        for i := range lhs {
                many := lhs[i]
                sig := lhsh[i].signature

                one, matched := oneBySig[sig]
                if !matched {
                        continue
                }

                // Restore the original operand order if the sides were swapped.
                fl, fr := many.F, one.F
                hl, hr := many.H, one.H
                if swapped {
                        fl, fr = fr, fl
                        hl, hr = hr, hl
                }
                floatValue, histogramValue, keep, err := vectorElemBinop(op, fl, fr, hl, hr)
                if err != nil {
                        lastErr = err
                }

                if returnBool {
                        floatValue = 0.0
                        if keep {
                                floatValue = 1.0
                        }
                } else if !keep {
                        continue
                }

                metric := resultMetric(many.Metric, one.Metric, op, matching, enh)
                if returnBool {
                        metric = metric.DropMetricName()
                }

                seen, exists := matchedSigs[sig]
                if matching.Card == parser.CardOneToOne {
                        if exists {
                                ev.errorf("multiple matches for labels: many-to-one matching must be explicit (group_left/group_right)")
                        }
                        matchedSigs[sig] = nil // Set existence to true.
                } else {
                        // In many-to-one matching the grouping labels must yield a
                        // unique result metric per match group; check whether this
                        // metric was already produced for the same signature.
                        resultHash := metric.Hash()

                        switch {
                        case !exists:
                                seen = map[uint64]struct{}{}
                                matchedSigs[sig] = seen
                        default:
                                if _, duplicate := seen[resultHash]; duplicate {
                                        ev.errorf("multiple matches for labels: grouping labels must ensure unique matches")
                                }
                        }
                        seen[resultHash] = struct{}{}
                }

                enh.Out = append(enh.Out, Sample{
                        Metric: metric,
                        F:      floatValue,
                        H:      histogramValue,
                })
        }
        return enh.Out, lastErr
}

// signatureFunc returns a function that computes the matching signature of a
// label set as a string, using b as the byte buffer passed to the labels
// encoder.
//
// With on set, the signature is built from the labels listed in names only.
// Otherwise it is built from all labels except those in names and the metric
// name. The names are sorted before use; in the on case this sorts the
// caller's slice in place.
func signatureFunc(on bool, b []byte, names ...string) func(labels.Labels) string {
        if !on {
                excluded := make([]string, 0, len(names)+1)
                excluded = append(excluded, labels.MetricName)
                excluded = append(excluded, names...)
                slices.Sort(excluded)
                return func(lset labels.Labels) string {
                        return string(lset.BytesWithoutLabels(b, excluded...))
                }
        }

        slices.Sort(names)
        return func(lset labels.Labels) string {
                return string(lset.BytesWithLabels(b, names...))
        }
}

// resultMetric returns the metric for the given sample(s) based on the Vector
// binary operation and the matching options.
//
// Results are cached in enh.resultMetric under a key formed by the byte
// encodings of lhs and rhs concatenated. On a cache miss the result is built
// from lhs: the metric name is removed when shouldDropMetricName(op) says so,
// one-to-one matching keeps only (on) or removes (ignoring) the matching
// labels, and each label in matching.Include is copied from rhs, or deleted
// if rhs has no value for it.
func resultMetric(lhs, rhs labels.Labels, op parser.ItemType, matching *parser.VectorMatching, enh *EvalNodeHelper) labels.Labels {
        if enh.resultMetric == nil {
                enh.resultMetric = make(map[string]labels.Labels, len(enh.Out))
        }

        enh.resetBuilder(lhs)

        // Build the cache key in the reusable buffers: bytes(lhs) + bytes(rhs).
        enh.lblBuf = lhs.Bytes(enh.lblBuf)
        key := append(enh.lblResultBuf[:0], enh.lblBuf...)
        enh.lblBuf = rhs.Bytes(enh.lblBuf)
        key = append(key, enh.lblBuf...)
        enh.lblResultBuf = key

        if cached, hit := enh.resultMetric[string(enh.lblResultBuf)]; hit {
                return cached
        }
        // The key buffer is reused by later calls, so the map needs its own copy.
        str := string(enh.lblResultBuf)

        if shouldDropMetricName(op) {
                enh.lb.Del(labels.MetricName)
        }

        if matching.Card == parser.CardOneToOne {
                switch {
                case matching.On:
                        enh.lb.Keep(matching.MatchingLabels...)
                default:
                        enh.lb.Del(matching.MatchingLabels...)
                }
        }
        // Included labels from the `group_x` modifier are taken from the "one"-side.
        for _, name := range matching.Include {
                val := rhs.Get(name)
                if val == "" {
                        enh.lb.Del(name)
                        continue
                }
                enh.lb.Set(name, val)
        }

        built := enh.lb.Labels()
        enh.resultMetric[str] = built
        return built
}

// VectorscalarBinop evaluates a binary operation between a Vector and a Scalar.
//
// lhs always holds the Vector; swap reports that the scalar was originally the
// left operand, in which case the operands are exchanged before they are
// passed to vectorElemBinop. Kept samples are appended to enh.Out, which is
// returned together with the last error produced by vectorElemBinop, if any.
// With returnBool set every sample is kept with value 1 or 0. The metric name
// is dropped when shouldDropMetricName(op) says so or when returnBool is set.
func (ev *evaluator) VectorscalarBinop(op parser.ItemType, lhs Vector, rhs Scalar, swap, returnBool bool, enh *EvalNodeHelper) (Vector, error) {
        var lastErr error
        for _, sample := range lhs {
                // The scalar side never carries a histogram, so rh starts out nil.
                var rh *histogram.FloatHistogram
                lh := sample.H
                lf, rf := sample.F, rhs.V
                if swap {
                        // Put the operands back into the order the expression had.
                        lf, rf = rf, lf
                        lh, rh = rh, lh
                }

                resF, resH, keep, err := vectorElemBinop(op, lf, rf, lh, rh)
                if err != nil {
                        lastErr = err
                }

                // For a scalar-vector comparison with the scalar on the left, the
                // output value must still be the vector element's value, which
                // after the swap sits on the right.
                if op.IsComparisonOperator() && swap {
                        resF = rf
                        resH = rh
                }

                if returnBool {
                        resF = 0.0
                        if keep {
                                resF = 1.0
                        }
                        keep = true
                }
                if !keep {
                        continue
                }

                sample.F = resF
                sample.H = resH
                if shouldDropMetricName(op) || returnBool {
                        sample.Metric = sample.Metric.DropMetricName()
                }
                enh.Out = append(enh.Out, sample)
        }
        return enh.Out, lastErr
}

// scalarBinop computes the result of op applied to two Scalars. Arithmetic
// operators return the computed value; comparison operators return 1 when the
// comparison holds and 0 otherwise. It panics for any other operator.
func scalarBinop(op parser.ItemType, lhs, rhs float64) float64 {
        var holds bool
        switch op {
        // Arithmetic operators return directly.
        case parser.ADD:
                return lhs + rhs
        case parser.SUB:
                return lhs - rhs
        case parser.MUL:
                return lhs * rhs
        case parser.DIV:
                return lhs / rhs
        case parser.POW:
                return math.Pow(lhs, rhs)
        case parser.MOD:
                return math.Mod(lhs, rhs)
        case parser.ATAN2:
                return math.Atan2(lhs, rhs)

        // Comparison operators record their outcome and share one conversion.
        case parser.EQLC:
                holds = lhs == rhs
        case parser.NEQ:
                holds = lhs != rhs
        case parser.GTR:
                holds = lhs > rhs
        case parser.LSS:
                holds = lhs < rhs
        case parser.GTE:
                holds = lhs >= rhs
        case parser.LTE:
                holds = lhs <= rhs

        default:
                panic(fmt.Errorf("operator %q not allowed for Scalar operations", op))
        }
        return btos(holds)
}

// vectorElemBinop computes op for a single pair of Vector elements. Each side
// is described by a float value and an optional histogram. It returns the
// resulting float, the resulting histogram (nil for float results), whether
// the element should be kept, and an error if a histogram operation failed.
// It panics for operators that are not valid between Vector elements.
func vectorElemBinop(op parser.ItemType, lhs, rhs float64, hlhs, hrhs *histogram.FloatHistogram) (float64, *histogram.FloatHistogram, bool, error) {
        var keep bool
        switch op {
        case parser.ADD:
                if hlhs == nil || hrhs == nil {
                        return lhs + rhs, nil, true, nil
                }
                sum, err := hlhs.Copy().Add(hrhs)
                if err != nil {
                        return 0, nil, false, err
                }
                return 0, sum.Compact(0), true, nil

        case parser.SUB:
                if hlhs == nil || hrhs == nil {
                        return lhs - rhs, nil, true, nil
                }
                diff, err := hlhs.Copy().Sub(hrhs)
                if err != nil {
                        return 0, nil, false, err
                }
                return 0, diff.Compact(0), true, nil

        case parser.MUL:
                // A histogram may be scaled by a float from either side.
                switch {
                case hlhs != nil && hrhs == nil:
                        return 0, hlhs.Copy().Mul(rhs), true, nil
                case hlhs == nil && hrhs != nil:
                        return 0, hrhs.Copy().Mul(lhs), true, nil
                default:
                        return lhs * rhs, nil, true, nil
                }

        case parser.DIV:
                // Only histogram / float is handled as a histogram operation.
                if hlhs != nil && hrhs == nil {
                        return 0, hlhs.Copy().Div(rhs), true, nil
                }
                return lhs / rhs, nil, true, nil

        case parser.POW:
                return math.Pow(lhs, rhs), nil, true, nil
        case parser.MOD:
                return math.Mod(lhs, rhs), nil, true, nil
        case parser.ATAN2:
                return math.Atan2(lhs, rhs), nil, true, nil

        // Comparisons keep the left-hand value and only decide on retention.
        case parser.EQLC:
                keep = lhs == rhs
        case parser.NEQ:
                keep = lhs != rhs
        case parser.GTR:
                keep = lhs > rhs
        case parser.LSS:
                keep = lhs < rhs
        case parser.GTE:
                keep = lhs >= rhs
        case parser.LTE:
                keep = lhs <= rhs

        default:
                panic(fmt.Errorf("operator %q not allowed for operations between Vectors", op))
        }
        return lhs, nil, keep, nil
}

// groupedAggregation holds the running state of one output group while an
// aggregation is evaluated at a single timestamp. The aggregation functions
// reset it the first time the group is seen at each step.
type groupedAggregation struct {
        seen           bool // Was this output group seen in the input at this timestamp.
        hasFloat       bool // Has at least 1 float64 sample aggregated.
        hasHistogram   bool // Has at least 1 histogram sample aggregated.
        // floatValue is the running float result (e.g. the sum, max or min so
        // far, or the variance accumulator for stdvar/stddev).
        floatValue     float64
        // histogramValue is the running histogram result for sum and avg.
        histogramValue *histogram.FloatHistogram
        floatMean      float64 // Mean, or "compensating value" for Kahan summation.
        // groupCount is the number of samples counted into this group so far.
        groupCount     int
        // heap collects samples for quantile, topk and bottomk.
        heap           vectorByValueHeap
}

// aggregation evaluates sum, avg, count, min, max, group, stdvar, stddev or
// quantile at one timestep on inputMatrix.
// These functions produce one output series for each group specified in the expression, with just the labels from `by(...)`.
// outputMatrix should be already populated with grouping labels; groups is one-to-one with outputMatrix.
// seriesToResult maps inputMatrix indexes to outputMatrix indexes.
// q is only used by quantile. Samples at enh.Ts are consumed from inputMatrix,
// and one point per seen group is appended to outputMatrix. The returned
// annotations carry warnings about histograms that could not be aggregated.
// It panics if e.Op is not one of the operators above.
func (ev *evaluator) aggregation(e *parser.AggregateExpr, q float64, inputMatrix, outputMatrix Matrix, seriesToResult []int, groups []groupedAggregation, enh *EvalNodeHelper) annotations.Annotations {
        op := e.Op
        var annos annotations.Annotations
        for i := range groups {
                groups[i].seen = false
        }

        for si := range inputMatrix {
                f, h, ok := ev.nextValues(enh.Ts, &inputMatrix[si])
                if !ok {
                        continue
                }

                group := &groups[seriesToResult[si]]
                // Initialize this group if it's the first time we've seen it.
                if !group.seen {
                        *group = groupedAggregation{
                                seen:       true,
                                floatValue: f,
                                groupCount: 1,
                        }
                        switch op {
                        case parser.AVG:
                                group.floatMean = f
                                fallthrough
                        case parser.SUM:
                                if h == nil {
                                        group.hasFloat = true
                                } else {
                                        group.histogramValue = h.Copy()
                                        group.hasHistogram = true
                                }
                        case parser.STDVAR, parser.STDDEV:
                                group.floatMean = f
                                group.floatValue = 0
                        case parser.QUANTILE:
                                group.heap = make(vectorByValueHeap, 1)
                                group.heap[0] = Sample{F: f}
                        case parser.GROUP:
                                group.floatValue = 1
                        }
                        continue
                }

                switch op {
                case parser.SUM:
                        if h != nil {
                                group.hasHistogram = true
                                if group.histogramValue != nil {
                                        _, err := group.histogramValue.Add(h)
                                        if err != nil {
                                                handleAggregationError(err, e, inputMatrix[si].Metric.Get(model.MetricNameLabel), &annos)
                                        }
                                }
                                // Otherwise the aggregation contained floats
                                // previously and will be invalid anyway. No
                                // point in copying the histogram in that case.
                        } else {
                                group.hasFloat = true
                                group.floatValue, group.floatMean = kahanSumInc(f, group.floatValue, group.floatMean)
                        }

                case parser.AVG:
                        group.groupCount++
                        if h != nil {
                                group.hasHistogram = true
                                if group.histogramValue != nil {
                                        // Incremental mean: mean += h/n - mean/n.
                                        left := h.Copy().Div(float64(group.groupCount))
                                        right := group.histogramValue.Copy().Div(float64(group.groupCount))
                                        toAdd, err := left.Sub(right)
                                        if err != nil {
                                                handleAggregationError(err, e, inputMatrix[si].Metric.Get(model.MetricNameLabel), &annos)
                                                // On failure toAdd must not be used (Sub returns a
                                                // nil histogram alongside the error), so skip this
                                                // sample rather than feed it to Add below.
                                                continue
                                        }
                                        _, err = group.histogramValue.Add(toAdd)
                                        if err != nil {
                                                handleAggregationError(err, e, inputMatrix[si].Metric.Get(model.MetricNameLabel), &annos)
                                        }
                                }
                                // Otherwise the aggregation contained floats
                                // previously and will be invalid anyway. No
                                // point in copying the histogram in that case.
                        } else {
                                group.hasFloat = true
                                if math.IsInf(group.floatMean, 0) {
                                        if math.IsInf(f, 0) && (group.floatMean > 0) == (f > 0) {
                                                // The `floatMean` and `f` values are `Inf` of the same sign.  They
                                                // can't be subtracted, but the value of `floatMean` is correct
                                                // already.
                                                break
                                        }
                                        if !math.IsInf(f, 0) && !math.IsNaN(f) {
                                                // At this stage, the mean is an infinite. If the added
                                                // value is neither an Inf or a Nan, we can keep that mean
                                                // value.
                                                // This is required because our calculation below removes
                                                // the mean value, which would look like Inf += x - Inf and
                                                // end up as a NaN.
                                                break
                                        }
                                }
                                // Divide each side of the `-` by `group.groupCount` to avoid float64 overflows.
                                group.floatMean += f/float64(group.groupCount) - group.floatMean/float64(group.groupCount)
                        }

                case parser.GROUP:
                        // Do nothing. Required to avoid the panic in `default:` below.

                case parser.MAX:
                        if group.floatValue < f || math.IsNaN(group.floatValue) {
                                group.floatValue = f
                        }

                case parser.MIN:
                        if group.floatValue > f || math.IsNaN(group.floatValue) {
                                group.floatValue = f
                        }

                case parser.COUNT:
                        group.groupCount++

                case parser.STDVAR, parser.STDDEV:
                        if h == nil { // Ignore native histograms.
                                // Incrementally update the mean and the sum of squared
                                // differences from the mean.
                                group.groupCount++
                                delta := f - group.floatMean
                                group.floatMean += delta / float64(group.groupCount)
                                group.floatValue += delta * (f - group.floatMean)
                        }

                case parser.QUANTILE:
                        group.heap = append(group.heap, Sample{F: f})

                default:
                        panic(fmt.Errorf("expected aggregation operator but got %q", op))
                }
        }

        // Construct the output matrix from the aggregated groups.
        numSteps := int((ev.endTimestamp-ev.startTimestamp)/ev.interval) + 1

        for ri, aggr := range groups {
                if !aggr.seen {
                        continue
                }
                switch op {
                case parser.AVG:
                        if aggr.hasFloat && aggr.hasHistogram {
                                // We cannot aggregate histogram sample with a float64 sample.
                                annos.Add(annotations.NewMixedFloatsHistogramsAggWarning(e.Expr.PositionRange()))
                                continue
                        }
                        if aggr.hasHistogram {
                                aggr.histogramValue = aggr.histogramValue.Compact(0)
                        } else {
                                aggr.floatValue = aggr.floatMean
                        }

                case parser.COUNT:
                        aggr.floatValue = float64(aggr.groupCount)

                case parser.STDVAR:
                        aggr.floatValue /= float64(aggr.groupCount)

                case parser.STDDEV:
                        aggr.floatValue = math.Sqrt(aggr.floatValue / float64(aggr.groupCount))

                case parser.QUANTILE:
                        aggr.floatValue = quantile(q, aggr.heap)

                case parser.SUM:
                        if aggr.hasFloat && aggr.hasHistogram {
                                // We cannot aggregate histogram sample with a float64 sample.
                                annos.Add(annotations.NewMixedFloatsHistogramsAggWarning(e.Expr.PositionRange()))
                                continue
                        }
                        if aggr.hasHistogram {
                                aggr.histogramValue.Compact(0)
                        } else {
                                aggr.floatValue += aggr.floatMean // Add Kahan summation compensating term.
                        }
                default:
                        // For other aggregations, we already have the right value.
                }

                ss := &outputMatrix[ri]
                addToSeries(ss, enh.Ts, aggr.floatValue, aggr.histogramValue, numSteps)
        }

        return annos
}

// aggregationK evaluates topk or bottomk at one timestep on inputMatrix.
// Output that has the same labels as the input, but just k of them per group.
// seriesToResult maps inputMatrix indexes to groups indexes.
// For an instant query, returns a Matrix in descending order for topk or ascending for bottomk.
// For a range query, aggregates output in the seriess map.
// Only the float value of each input sample is considered. It panics if e.Op
// is neither topk nor bottomk once a group receives a second sample.
func (ev *evaluator) aggregationK(e *parser.AggregateExpr, k int, inputMatrix Matrix, seriesToResult []int, groups []groupedAggregation, enh *EvalNodeHelper, seriess map[uint64]Series) (Matrix, annotations.Annotations) {
        op := e.Op
        var s Sample
        var annos annotations.Annotations
        // Mark every group as unseen for this timestep.
        for i := range groups {
                groups[i].seen = false
        }

        for si := range inputMatrix {
                f, _, ok := ev.nextValues(enh.Ts, &inputMatrix[si])
                if !ok {
                        // No sample for this series at enh.Ts.
                        continue
                }
                s = Sample{Metric: inputMatrix[si].Metric, F: f}

                group := &groups[seriesToResult[si]]
                // Initialize this group if it's the first time we've seen it.
                if !group.seen {
                        // The heap starts with this one sample and has capacity for k.
                        *group = groupedAggregation{
                                seen: true,
                                heap: make(vectorByValueHeap, 1, k),
                        }
                        group.heap[0] = s
                        continue
                }

                switch op {
                case parser.TOPK:
                        // We build a heap of up to k elements, with the smallest element at heap[0].
                        switch {
                        case len(group.heap) < k:
                                heap.Push(&group.heap, &s)
                        case group.heap[0].F < s.F || (math.IsNaN(group.heap[0].F) && !math.IsNaN(s.F)):
                                // This new element is bigger than the previous smallest element - overwrite that.
                                // A NaN at the top is also replaced by any non-NaN value.
                                group.heap[0] = s
                                if k > 1 {
                                        heap.Fix(&group.heap, 0) // Maintain the heap invariant.
                                }
                        }

                case parser.BOTTOMK:
                        // We build a heap of up to k elements, with the biggest element at heap[0].
                        switch {
                        case len(group.heap) < k:
                                heap.Push((*vectorByReverseValueHeap)(&group.heap), &s)
                        case group.heap[0].F > s.F || (math.IsNaN(group.heap[0].F) && !math.IsNaN(s.F)):
                                // This new element is smaller than the previous biggest element - overwrite that.
                                // A NaN at the top is also replaced by any non-NaN value.
                                group.heap[0] = s
                                if k > 1 {
                                        heap.Fix((*vectorByReverseValueHeap)(&group.heap), 0) // Maintain the heap invariant.
                                }
                        }

                default:
                        panic(fmt.Errorf("expected aggregation operator but got %q", op))
                }
        }

        // Construct the result from the aggregated groups.
        numSteps := int((ev.endTimestamp-ev.startTimestamp)/ev.interval) + 1
        var mat Matrix
        // mat is only allocated when start and end coincide; otherwise it stays
        // nil and results go into seriess.
        if ev.endTimestamp == ev.startTimestamp {
                mat = make(Matrix, 0, len(groups))
        }

        // add emits one output point for the series identified by lbls.
        add := func(lbls labels.Labels, f float64) {
                // If this could be an instant query, add directly to the matrix so the result is in consistent order.
                if ev.endTimestamp == ev.startTimestamp {
                        mat = append(mat, Series{Metric: lbls, Floats: []FPoint{{T: enh.Ts, F: f}}})
                } else {
                        // Otherwise the results are added into seriess elements.
                        hash := lbls.Hash()
                        ss, ok := seriess[hash]
                        if !ok {
                                ss = Series{Metric: lbls}
                        }
                        addToSeries(&ss, enh.Ts, f, nil, numSteps)
                        // ss is a copy of the map value, so store it back.
                        seriess[hash] = ss
                }
        }
        for _, aggr := range groups {
                if !aggr.seen {
                        continue
                }
                switch op {
                case parser.TOPK:
                        // The heap keeps the lowest value on top, so reverse it.
                        if len(aggr.heap) > 1 {
                                sort.Sort(sort.Reverse(aggr.heap))
                        }
                        for _, v := range aggr.heap {
                                add(v.Metric, v.F)
                        }

                case parser.BOTTOMK:
                        // The heap keeps the highest value on top, so reverse it.
                        if len(aggr.heap) > 1 {
                                sort.Sort(sort.Reverse((*vectorByReverseValueHeap)(&aggr.heap)))
                        }
                        for _, v := range aggr.heap {
                                add(v.Metric, v.F)
                        }
                }
        }

        return mat, annos
}

// aggregationCountValues evaluates count_values on vec.
// Each sample's float value is written into the valueLabel label, and the
// samples are then counted per group, so a group yields one output series per
// distinct value. Results are appended to enh.Out; no annotations are produced.
func (ev *evaluator) aggregationCountValues(e *parser.AggregateExpr, grouping []string, valueLabel string, vec Vector, enh *EvalNodeHelper) (Vector, annotations.Annotations) {
        type groupCount struct {
                labels labels.Labels
                count  int
        }
        counts := map[uint64]*groupCount{}

        var scratch []byte
        for _, sample := range vec {
                enh.resetBuilder(sample.Metric)
                enh.lb.Set(valueLabel, strconv.FormatFloat(sample.F, 'f', -1, 64))
                withValue := enh.lb.Labels()

                // count_values() is used less often than other aggregations, so
                // recomputing the grouping key on every step is acceptable here.
                var key uint64
                key, scratch = generateGroupingKey(withValue, grouping, e.Without, scratch)

                if existing, found := counts[key]; found {
                        existing.count++
                        continue
                }
                // First sample for this key: create the group.
                counts[key] = &groupCount{
                        labels: generateGroupingLabels(enh, withValue, e.Without, grouping),
                        count:  1,
                }
        }

        // Turn every counted group into one output sample.
        for _, entry := range counts {
                out := Sample{
                        Metric: entry.labels,
                        F:      float64(entry.count),
                }
                enh.Out = append(enh.Out, out)
        }
        return enh.Out, nil
}

// addToSeries appends one point at ts to ss: a histogram point when h is
// non-nil, otherwise a float point with value f. The target slice is obtained
// from the point-slice pool, sized by numSteps, the first time it is needed.
func addToSeries(ss *Series, ts int64, f float64, h *histogram.FloatHistogram, numSteps int) {
        if h != nil {
                if ss.Histograms == nil {
                        ss.Histograms = getHPointSlice(numSteps)
                }
                ss.Histograms = append(ss.Histograms, HPoint{T: ts, H: h})
                return
        }

        if ss.Floats == nil {
                ss.Floats = getFPointSlice(numSteps)
        }
        ss.Floats = append(ss.Floats, FPoint{T: ts, F: f})
}

// nextValues pops the leading point of series if its timestamp equals ts.
// Floats are checked before histograms. It returns the float value or the
// histogram of the consumed point and true, or zero values and false when
// neither leading point is at ts.
func (ev *evaluator) nextValues(ts int64, series *Series) (f float64, h *histogram.FloatHistogram, b bool) {
        if floats := series.Floats; len(floats) > 0 && floats[0].T == ts {
                series.Floats = floats[1:] // Move input vectors forward
                return floats[0].F, nil, true
        }
        if hists := series.Histograms; len(hists) > 0 && hists[0].T == ts {
                series.Histograms = hists[1:]
                return 0, hists[0].H, true
        }
        return 0, nil, false
}

// handleAggregationError translates a histogram aggregation error into the
// matching warning annotation and adds it to annos. Errors other than the two
// histogram incompatibility errors are ignored.
func handleAggregationError(err error, e *parser.AggregateExpr, metricName string, annos *annotations.Annotations) {
        pos := e.Expr.PositionRange()
        switch {
        case errors.Is(err, histogram.ErrHistogramsIncompatibleSchema):
                annos.Add(annotations.NewMixedExponentialCustomHistogramsWarning(metricName, pos))
        case errors.Is(err, histogram.ErrHistogramsIncompatibleBounds):
                annos.Add(annotations.NewIncompatibleCustomBucketsHistogramsWarning(metricName, pos))
        }
}

// handleVectorBinopError maps an error from a vector binary operation to the
// corresponding warning annotation. It returns nil when err is nil or is not
// one of the histogram incompatibility errors. The metric name in the warning
// is left empty.
func handleVectorBinopError(err error, e *parser.BinaryExpr) annotations.Annotations {
        if err == nil {
                return nil
        }

        const metricName = ""
        pos := e.PositionRange()
        switch {
        case errors.Is(err, histogram.ErrHistogramsIncompatibleSchema):
                return annotations.New().Add(annotations.NewMixedExponentialCustomHistogramsWarning(metricName, pos))
        case errors.Is(err, histogram.ErrHistogramsIncompatibleBounds):
                return annotations.New().Add(annotations.NewIncompatibleCustomBucketsHistogramsWarning(metricName, pos))
        default:
                return nil
        }
}

// generateGroupingKey computes the grouping key of metric for the given
// grouping labels. With without set, the key hashes metric excluding the
// grouping labels; otherwise it hashes only the grouping labels. buf is scratch
// space that is returned for reuse.
func generateGroupingKey(metric labels.Labels, grouping []string, without bool, buf []byte) (uint64, []byte) {
        switch {
        case without:
                return metric.HashWithoutLabels(buf, grouping...)
        case len(grouping) == 0:
                // With no grouping labels there is nothing to hash.
                return 0, buf
        default:
                return metric.HashForLabels(buf, grouping...)
        }
}

// generateGroupingLabels derives the output label set of a group from metric.
// With without set, the grouping labels and the metric name are removed;
// otherwise only the grouping labels are kept. With no grouping labels and
// without unset, the result is the empty label set.
func generateGroupingLabels(enh *EvalNodeHelper, metric labels.Labels, without bool, grouping []string) labels.Labels {
        enh.resetBuilder(metric)

        if without {
                enh.lb.Del(grouping...)
                enh.lb.Del(labels.MetricName)
                return enh.lb.Labels()
        }
        if len(grouping) == 0 {
                return labels.EmptyLabels()
        }
        enh.lb.Keep(grouping...)
        return enh.lb.Labels()
}

// btos converts a boolean to a float: 1 for true, 0 for false.
func btos(b bool) float64 {
        var v float64
        if b {
                v = 1
        }
        return v
}

// shouldDropMetricName reports whether the result of applying op must not
// carry the metric name. This is the case for the arithmetic operators and
// atan2.
func shouldDropMetricName(op parser.ItemType) bool {
        switch op {
        case parser.ADD, parser.SUB, parser.DIV, parser.MUL, parser.POW, parser.MOD, parser.ATAN2:
                return true
        }
        return false
}

// NewOriginContext derives a context from ctx that carries data about the
// query origin, stored under the QueryOrigin key.
func NewOriginContext(ctx context.Context, data map[string]interface{}) context.Context {
        key := QueryOrigin{}
        return context.WithValue(ctx, key, data)
}

// formatDate renders t in UTC as an RFC 3339 style timestamp with millisecond
// precision.
func formatDate(t time.Time) string {
        const layout = "2006-01-02T15:04:05.000Z07:00"
        utc := t.UTC()
        return utc.Format(layout)
}

// unwrapParenExpr strips any number of enclosing parentheses from the
// expression e points to, replacing *e with the innermost non-paren expression.
func unwrapParenExpr(e *parser.Expr) {
        for {
                paren, isParen := (*e).(*parser.ParenExpr)
                if !isParen {
                        return
                }
                *e = paren.Expr
        }
}

// unwrapStepInvariantExpr returns the expression wrapped by e if e is a
// StepInvariantExpr, and e itself otherwise. Only one level is removed.
func unwrapStepInvariantExpr(e parser.Expr) parser.Expr {
        wrapper, isWrapper := e.(*parser.StepInvariantExpr)
        if !isWrapper {
                return e
        }
        return wrapper.Expr
}

// PreprocessExpr wraps every step invariant part of expr in a
// StepInvariantExpr and resolves the preprocessors (start/end timestamps)
// against start and end. If the whole expression is step invariant, the
// returned expression is the wrapped root.
func PreprocessExpr(expr parser.Expr, start, end time.Time) parser.Expr {
        detectHistogramStatsDecoding(expr)

        if !preprocessExprHelper(expr, start, end) {
                return expr
        }
        return newStepInvariantExpr(expr)
}

// preprocessExprHelper walks expr and wraps those child nodes that are step
// invariant in a StepInvariantExpr. It reports whether expr itself qualifies
// for being wrapped, leaving that wrapping to the caller. Along the way it
// resolves the start/end preprocessors into concrete timestamps.
// It panics on node types it does not know.
func preprocessExprHelper(expr parser.Expr, start, end time.Time) bool {
        switch n := expr.(type) {
        case *parser.StringLiteral, *parser.NumberLiteral:
                return true

        case *parser.ParenExpr:
                return preprocessExprHelper(n.Expr, start, end)

        case *parser.UnaryExpr:
                return preprocessExprHelper(n.Expr, start, end)

        case *parser.AggregateExpr:
                return preprocessExprHelper(n.Expr, start, end)

        case *parser.MatrixSelector:
                return preprocessExprHelper(n.VectorSelector, start, end)

        case *parser.VectorSelector:
                if n.StartOrEnd == parser.START {
                        n.Timestamp = makeInt64Pointer(timestamp.FromTime(start))
                } else if n.StartOrEnd == parser.END {
                        n.Timestamp = makeInt64Pointer(timestamp.FromTime(end))
                }
                return n.Timestamp != nil

        case *parser.BinaryExpr:
                // Both sides are always processed, even if the first is not invariant.
                lhsInvariant := preprocessExprHelper(n.LHS, start, end)
                rhsInvariant := preprocessExprHelper(n.RHS, start, end)
                if lhsInvariant && rhsInvariant {
                        return true
                }

                if lhsInvariant {
                        n.LHS = newStepInvariantExpr(n.LHS)
                }
                if rhsInvariant {
                        n.RHS = newStepInvariantExpr(n.RHS)
                }
                return false

        case *parser.Call:
                // A call is invariant only if the function is not listed as unsafe
                // for the @ modifier and every argument is invariant.
                _, isUnsafe := AtModifierUnsafeFunctions[n.Func.Name]
                allInvariant := !isUnsafe
                argInvariant := make([]bool, len(n.Args))
                for i, arg := range n.Args {
                        argInvariant[i] = preprocessExprHelper(arg, start, end)
                        allInvariant = allInvariant && argInvariant[i]
                }
                if allInvariant {
                        return true
                }

                // Otherwise wrap just the invariant arguments.
                for i, invariant := range argInvariant {
                        if !invariant {
                                continue
                        }
                        n.Args[i] = newStepInvariantExpr(n.Args[i])
                }
                return false

        case *parser.SubqueryExpr:
                // The offset is adjusted for the @ modifier evaluation, and doing
                // that for every subquery step is tricky. So the inside of the
                // subquery is wrapped whenever it is step invariant, regardless of
                // @ on the subquery itself, so that it is evaluated only once
                // w.r.t. the start time of the subquery.
                if preprocessExprHelper(n.Expr, start, end) {
                        n.Expr = newStepInvariantExpr(n.Expr)
                }
                if n.StartOrEnd == parser.START {
                        n.Timestamp = makeInt64Pointer(timestamp.FromTime(start))
                } else if n.StartOrEnd == parser.END {
                        n.Timestamp = makeInt64Pointer(timestamp.FromTime(end))
                }
                return n.Timestamp != nil
        }

        panic(fmt.Sprintf("found unexpected node %#v", expr))
}

// newStepInvariantExpr wraps expr in a StepInvariantExpr node.
func newStepInvariantExpr(expr parser.Expr) parser.Expr {
        wrapped := &parser.StepInvariantExpr{Expr: expr}
        return wrapped
}

// setOffsetForAtModifier rewrites the offset of every vector selector, matrix
// selector and subquery in expr so that the timestamp given by an @ modifier
// is honored when evaluating at evalTime. Nodes without an @ timestamp get
// their original offset.
func setOffsetForAtModifier(evalTime int64, expr parser.Expr) {
        // sinceMillis is the span from ts to evalTime, both in milliseconds.
        sinceMillis := func(ts int64) time.Duration {
                return time.Duration(evalTime-ts) * time.Millisecond
        }

        adjust := func(ts *int64, originalOffset time.Duration, path []parser.Node) time.Duration {
                if ts == nil {
                        return originalOffset
                }

                // Account for the offset (and @ timestamp, if any) reported by
                // subqueryTimes for the enclosing path.
                enclosingOffset, _, enclosingTs := subqueryTimes(path)
                if enclosingTs != nil {
                        enclosingOffset += sinceMillis(*enclosingTs)
                }

                delta := sinceMillis(*ts) - enclosingOffset
                return originalOffset + delta
        }

        parser.Inspect(expr, func(node parser.Node, path []parser.Node) error {
                switch sel := node.(type) {
                case *parser.VectorSelector:
                        sel.Offset = adjust(sel.Timestamp, sel.OriginalOffset, path)

                case *parser.MatrixSelector:
                        inner := sel.VectorSelector.(*parser.VectorSelector)
                        inner.Offset = adjust(inner.Timestamp, inner.OriginalOffset, path)

                case *parser.SubqueryExpr:
                        sel.Offset = adjust(sel.Timestamp, sel.OriginalOffset, path)
                }
                return nil
        })
}

// detectHistogramStatsDecoding marks vector selectors for which only the
// histogram statistics (sum and count) are needed by setting their
// SkipHistogramBuckets field, so that spans and buckets may be left out.
// This is purely an optimization; correctness does not depend on it.
//
// For each vector selector the ancestors in path are scanned in order and the
// first histogram function call encountered decides the flag:
// histogram_count/histogram_sum set it, histogram_quantile/histogram_fraction
// clear it.
func detectHistogramStatsDecoding(expr parser.Expr) {
        parser.Inspect(expr, func(node parser.Node, path []parser.Node) error {
                selector, isSelector := node.(*parser.VectorSelector)
                if !isSelector {
                        return nil
                }

        ancestors:
                for _, ancestor := range path {
                        call, isCall := ancestor.(*parser.Call)
                        if !isCall {
                                continue
                        }
                        switch call.Func.Name {
                        case "histogram_count", "histogram_sum":
                                selector.SkipHistogramBuckets = true
                                break ancestors
                        case "histogram_quantile", "histogram_fraction":
                                selector.SkipHistogramBuckets = false
                                break ancestors
                        }
                }
                // NOTE(review): a non-nil error is returned once a vector
                // selector was handled, presumably to stop parser.Inspect from
                // descending further - confirm against parser.Inspect.
                return fmt.Errorf("stop")
        })
}

// makeInt64Pointer returns a pointer to a new int64 holding val. Every call
// yields a distinct pointer.
func makeInt64Pointer(val int64) *int64 {
        return &val
}

// histogramStatsSeries embeds a storage.Series and overrides its Iterator
// method so that the underlying iterator is wrapped by
// NewHistogramStatsIterator.
type histogramStatsSeries struct {
        storage.Series
}

// newHistogramStatsSeries returns a histogramStatsSeries that wraps the given
// series.
func newHistogramStatsSeries(series storage.Series) *histogramStatsSeries {
        wrapped := histogramStatsSeries{Series: series}
        return &wrapped
}

// Iterator obtains the iterator of the embedded series (passing it through)
// and wraps it with NewHistogramStatsIterator.
func (s histogramStatsSeries) Iterator(it chunkenc.Iterator) chunkenc.Iterator {
        underlying := s.Series.Iterator(it)
        return NewHistogramStatsIterator(underlying)
}

// Copyright 2015 The Prometheus Authors
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package promql

import (
        "errors"
        "fmt"
        "math"
        "slices"
        "sort"
        "strconv"
        "strings"
        "time"

        "github.com/facette/natsort"
        "github.com/grafana/regexp"
        "github.com/prometheus/common/model"

        "github.com/prometheus/prometheus/model/histogram"
        "github.com/prometheus/prometheus/model/labels"
        "github.com/prometheus/prometheus/promql/parser"
        "github.com/prometheus/prometheus/promql/parser/posrange"
        "github.com/prometheus/prometheus/util/annotations"
)

// FunctionCall is the type of a PromQL function implementation.
//
// vals is a list of the evaluated arguments for the function call.
//
// For range vectors it will be a Matrix with one series, for instant vectors
// a Vector, for scalars a Vector with one series whose value is the scalar
// value, and nil for strings.
//
// args are the original arguments to the function, where you can access
// matrixSelectors, vectorSelectors, and StringLiterals.
//
// enh.Out is a pre-allocated empty vector that you may use to accumulate
// output before returning it. The vectors in vals should not be returned.
//
// Range vector functions need only return a vector with the right value,
// the metric and timestamp are not needed.
//
// Instant vector functions need only return a vector with the right values and
// metrics, the timestamps are not needed.
//
// Scalar results should be returned as the value of a sample in a Vector.
type FunctionCall func(vals []parser.Value, args parser.Expressions, enh *EvalNodeHelper) (Vector, annotations.Annotations)

// === time() float64 ===
//
// funcTime returns a single-sample vector whose value is the evaluation
// timestamp enh.Ts divided by 1000.
func funcTime(vals []parser.Value, args parser.Expressions, enh *EvalNodeHelper) (Vector, annotations.Annotations) {
        now := Sample{F: float64(enh.Ts) / 1000}
        return Vector{now}, nil
}

// extrapolatedRate is a utility function for rate/increase/delta.
// It calculates the rate (allowing for counter resets if isCounter is true),
// extrapolates if the first/last sample is close to the boundary, and returns
// the result as either per-second (if isRate is true) or overall.
//
// args[0] must be a matrix selector and vals[0] the corresponding Matrix; only
// its first series is used. The series must contain either at least two
// floats and no histograms, or at least two histograms and no floats.
// Otherwise nothing is appended to enh.Out (a warning annotation is returned
// for the mixed case). Annotations produced by histogramRate are always
// passed on to the caller, including when a histogram result was computed.
func extrapolatedRate(vals []parser.Value, args parser.Expressions, enh *EvalNodeHelper, isCounter, isRate bool) (Vector, annotations.Annotations) {
        ms := args[0].(*parser.MatrixSelector)
        vs := ms.VectorSelector.(*parser.VectorSelector)
        var (
                samples            = vals[0].(Matrix)[0]
                rangeStart         = enh.Ts - durationMilliseconds(ms.Range+vs.Offset)
                rangeEnd           = enh.Ts - durationMilliseconds(vs.Offset)
                resultFloat        float64
                resultHistogram    *histogram.FloatHistogram
                firstT, lastT      int64
                numSamplesMinusOne int
                annos              annotations.Annotations
        )

        // We need either at least two Histograms and no Floats, or at least two
        // Floats and no Histograms to calculate a rate. Otherwise, drop this
        // Vector element.
        metricName := samples.Metric.Get(labels.MetricName)
        if len(samples.Histograms) > 0 && len(samples.Floats) > 0 {
                return enh.Out, annos.Add(annotations.NewMixedFloatsHistogramsWarning(metricName, args[0].PositionRange()))
        }

        switch {
        case len(samples.Histograms) > 1:
                numSamplesMinusOne = len(samples.Histograms) - 1
                firstT = samples.Histograms[0].T
                lastT = samples.Histograms[numSamplesMinusOne].T
                var newAnnos annotations.Annotations
                resultHistogram, newAnnos = histogramRate(samples.Histograms, isCounter, metricName, args[0].PositionRange())
                // Keep the annotations from histogramRate in every case, so
                // that warnings accompanying a valid result are not lost.
                annos.Merge(newAnnos)
                if resultHistogram == nil {
                        // The histograms are not compatible with each other.
                        return enh.Out, annos
                }
        case len(samples.Floats) > 1:
                numSamplesMinusOne = len(samples.Floats) - 1
                firstT = samples.Floats[0].T
                lastT = samples.Floats[numSamplesMinusOne].T
                resultFloat = samples.Floats[numSamplesMinusOne].F - samples.Floats[0].F
                if !isCounter {
                        break
                }
                // Handle counter resets:
                prevValue := samples.Floats[0].F
                for _, currPoint := range samples.Floats[1:] {
                        if currPoint.F < prevValue {
                                resultFloat += prevValue
                        }
                        prevValue = currPoint.F
                }
        default:
                // TODO: add RangeTooShortWarning
                return enh.Out, annos
        }

        // Duration between first/last samples and boundary of range.
        durationToStart := float64(firstT-rangeStart) / 1000
        durationToEnd := float64(rangeEnd-lastT) / 1000

        sampledInterval := float64(lastT-firstT) / 1000
        averageDurationBetweenSamples := sampledInterval / float64(numSamplesMinusOne)

        // If the first/last samples are close to the boundaries of the range,
        // extrapolate the result. This is as we expect that another sample
        // will exist given the spacing between samples we've seen thus far,
        // with an allowance for noise.
        extrapolationThreshold := averageDurationBetweenSamples * 1.1
        extrapolateToInterval := sampledInterval

        if durationToStart >= extrapolationThreshold {
                durationToStart = averageDurationBetweenSamples / 2
        }
        if isCounter && resultFloat > 0 && len(samples.Floats) > 0 && samples.Floats[0].F >= 0 {
                // Counters cannot be negative. If we have any slope at all
                // (i.e. resultFloat went up), we can extrapolate the zero point
                // of the counter. If the duration to the zero point is shorter
                // than the durationToStart, we take the zero point as the start
                // of the series, thereby avoiding extrapolation to negative
                // counter values.
                // TODO(beorn7): Do this for histograms, too.
                durationToZero := sampledInterval * (samples.Floats[0].F / resultFloat)
                if durationToZero < durationToStart {
                        durationToStart = durationToZero
                }
        }
        extrapolateToInterval += durationToStart

        if durationToEnd >= extrapolationThreshold {
                durationToEnd = averageDurationBetweenSamples / 2
        }
        extrapolateToInterval += durationToEnd

        factor := extrapolateToInterval / sampledInterval
        if isRate {
                factor /= ms.Range.Seconds()
        }
        if resultHistogram == nil {
                resultFloat *= factor
        } else {
                resultHistogram.Mul(factor)
        }

        return append(enh.Out, Sample{F: resultFloat, H: resultHistogram}), annos
}

// histogramRate is a helper function for extrapolatedRate. It requires
// points[0] to be a histogram. It returns nil if any other Point in points is
// not a histogram, or if the histograms cannot be subtracted/added because of
// incompatible schemas or custom bucket bounds; in those cases a warning
// wrapped in an annotation is returned alongside.
//
// Otherwise, it returns the calculated histogram (last minus first, corrected
// for counter resets if isCounter is true) together with any warnings
// collected on the way, i.e. a gauge histogram found where a counter was
// expected, or a non-gauge histogram found where a gauge was expected. The
// returned annotations may be empty.
func histogramRate(points []HPoint, isCounter bool, metricName string, pos posrange.PositionRange) (*histogram.FloatHistogram, annotations.Annotations) {
        prev := points[0].H
        last := points[len(points)-1].H
        if last == nil {
                return nil, annotations.New().Add(annotations.NewMixedFloatsHistogramsWarning(metricName, pos))
        }
        minSchema := prev.Schema
        if last.Schema < minSchema {
                minSchema = last.Schema
        }

        var annos annotations.Annotations

        // First iteration to find out two things:
        // - What's the smallest relevant schema?
        // - Are all data points histograms?
        //   TODO(beorn7): Find a way to check that earlier, e.g. by handing in a
        //   []FloatPoint and a []HistogramPoint separately.
        for _, currPoint := range points[1 : len(points)-1] {
                curr := currPoint.H
                if curr == nil {
                        return nil, annotations.New().Add(annotations.NewMixedFloatsHistogramsWarning(metricName, pos))
                }
                if !isCounter {
                        // Only the first and last point contribute to the
                        // result in the non-counter case.
                        continue
                }
                if curr.CounterResetHint == histogram.GaugeType {
                        annos.Add(annotations.NewNativeHistogramNotCounterWarning(metricName, pos))
                }
                if curr.Schema < minSchema {
                        minSchema = curr.Schema
                }
        }

        h := last.CopyToSchema(minSchema)
        _, err := h.Sub(prev)
        if err != nil {
                // Errors other than the two handled below are not reported.
                if errors.Is(err, histogram.ErrHistogramsIncompatibleSchema) {
                        return nil, annotations.New().Add(annotations.NewMixedExponentialCustomHistogramsWarning(metricName, pos))
                } else if errors.Is(err, histogram.ErrHistogramsIncompatibleBounds) {
                        return nil, annotations.New().Add(annotations.NewIncompatibleCustomBucketsHistogramsWarning(metricName, pos))
                }
        }

        if isCounter {
                // Second iteration to deal with counter resets.
                for _, currPoint := range points[1:] {
                        curr := currPoint.H
                        if curr.DetectReset(prev) {
                                _, err := h.Add(prev)
                                if err != nil {
                                        if errors.Is(err, histogram.ErrHistogramsIncompatibleSchema) {
                                                return nil, annotations.New().Add(annotations.NewMixedExponentialCustomHistogramsWarning(metricName, pos))
                                        } else if errors.Is(err, histogram.ErrHistogramsIncompatibleBounds) {
                                                return nil, annotations.New().Add(annotations.NewIncompatibleCustomBucketsHistogramsWarning(metricName, pos))
                                        }
                                }
                        }
                        prev = curr
                }
        } else if points[0].H.CounterResetHint != histogram.GaugeType || points[len(points)-1].H.CounterResetHint != histogram.GaugeType {
                annos.Add(annotations.NewNativeHistogramNotGaugeWarning(metricName, pos))
        }

        h.CounterResetHint = histogram.GaugeType
        // Hand the collected warnings back to the caller instead of dropping
        // them on the success path.
        return h.Compact(0), annos
}

// === delta(Matrix parser.ValueTypeMatrix) (Vector, Annotations) ===
//
// funcDelta delegates to extrapolatedRate without counter-reset handling and
// without conversion to a per-second value.
func funcDelta(vals []parser.Value, args parser.Expressions, enh *EvalNodeHelper) (Vector, annotations.Annotations) {
        const (
                isCounter = false
                isRate    = false
        )
        return extrapolatedRate(vals, args, enh, isCounter, isRate)
}

// === rate(node parser.ValueTypeMatrix) (Vector, Annotations) ===
//
// funcRate delegates to extrapolatedRate with counter-reset handling and with
// conversion to a per-second value.
func funcRate(vals []parser.Value, args parser.Expressions, enh *EvalNodeHelper) (Vector, annotations.Annotations) {
        const (
                isCounter = true
                isRate    = true
        )
        return extrapolatedRate(vals, args, enh, isCounter, isRate)
}

// === increase(node parser.ValueTypeMatrix) (Vector, Annotations) ===
//
// funcIncrease delegates to extrapolatedRate with counter-reset handling but
// without conversion to a per-second value.
func funcIncrease(vals []parser.Value, args parser.Expressions, enh *EvalNodeHelper) (Vector, annotations.Annotations) {
        const (
                isCounter = true
                isRate    = false
        )
        return extrapolatedRate(vals, args, enh, isCounter, isRate)
}

// === irate(node parser.ValueTypeMatrix) (Vector, Annotations) ===
//
// funcIrate delegates to instantValue in rate mode, appending to enh.Out.
func funcIrate(vals []parser.Value, args parser.Expressions, enh *EvalNodeHelper) (Vector, annotations.Annotations) {
        const isRate = true
        return instantValue(vals, enh.Out, isRate)
}

// === idelta(node parser.ValueTypeMatrix) (Vector, Annotations) ===
//
// funcIdelta delegates to instantValue in non-rate mode, appending to enh.Out.
func funcIdelta(vals []parser.Value, args parser.Expressions, enh *EvalNodeHelper) (Vector, annotations.Annotations) {
        const isRate = false
        return instantValue(vals, enh.Out, isRate)
}

// instantValue computes a value from the last two float samples of the first
// series in vals[0] and appends it to out.
//
// With isRate false the result is the plain difference of the two samples.
// With isRate true a decrease is treated as a counter reset (the last value is
// used as the increase) and the result is divided by the time between the two
// samples, converted from milliseconds to seconds.
//
// Nothing is appended if there are fewer than two float samples or if both
// samples carry the same timestamp.
func instantValue(vals []parser.Value, out Vector, isRate bool) (Vector, annotations.Annotations) {
        floats := vals[0].(Matrix)[0].Floats
        n := len(floats)
        // TODO: add RangeTooShortWarning
        if n < 2 {
                // Two points are the minimum needed; drop this Vector element.
                return out, nil
        }
        prev, last := floats[n-2], floats[n-1]

        elapsedMs := last.T - prev.T
        if elapsedMs == 0 {
                // Avoid dividing by 0.
                return out, nil
        }

        delta := last.F - prev.F
        if isRate {
                if last.F < prev.F {
                        // Counter reset.
                        delta = last.F
                }
                // Convert to per-second.
                delta /= float64(elapsedMs) / 1000
        }

        return append(out, Sample{F: delta}), nil
}

// calcTrendValue calculates the trend value at the given index i in the raw
// data. This is somewhat analogous to the slope of the trend at that index.
//
//   - tf is the trend factor.
//   - s0 is the previous smoothed value.
//   - s1 is the current smoothed value.
//   - b is the previous trend value; it is returned unchanged for i == 0.
//
// For i != 0 the result is tf*(s1-s0) + (1-tf)*b.
func calcTrendValue(i int, tf, s0, s1, b float64) float64 {
        if i != 0 {
                slopePart := tf * (s1 - s0)
                carryPart := (1 - tf) * b
                return slopePart + carryPart
        }
        return b
}

// Holt-Winters is similar to a weighted moving average, where historical data
// has exponentially less influence on the current data. Holt-Winters also
// accounts for trends in data. The smoothing factor (0 < sf < 1) affects how
// historical data will affect the current data. A lower smoothing factor
// increases the influence of historical data. The trend factor (0 < tf < 1)
// affects how trends in historical data will affect the current data. A higher
// trend factor increases the influence of trends. Algorithm taken from
// https://en.wikipedia.org/wiki/Exponential_smoothing titled: "Double
// exponential smoothing".
//
// vals[0] is the range vector, vals[1] the smoothing factor and vals[2] the
// trend factor. The function panics if either factor lies outside (0, 1) and
// appends nothing if the series has fewer than two float samples.
func funcHoltWinters(vals []parser.Value, args parser.Expressions, enh *EvalNodeHelper) (Vector, annotations.Annotations) {
        series := vals[0].(Matrix)[0]
        smoothingFactor := vals[1].(Vector)[0].F
        trendFactor := vals[2].(Vector)[0].F

        // Reject factors outside the open interval (0, 1).
        if smoothingFactor <= 0 || smoothingFactor >= 1 {
                panic(fmt.Errorf("invalid smoothing factor. Expected: 0 < sf < 1, got: %f", smoothingFactor))
        }
        if trendFactor <= 0 || trendFactor >= 1 {
                panic(fmt.Errorf("invalid trend factor. Expected: 0 < tf < 1, got: %f", trendFactor))
        }

        points := series.Floats
        if len(points) < 2 {
                // Smoothing needs at least two points.
                return enh.Out, nil
        }

        // Initial state: the smoothed value starts at the first sample and the
        // trend at the difference between the first two samples.
        var prevSmoothed float64
        smoothed := points[0].F
        trend := points[1].F - points[0].F

        // step counts from 0 for the second sample onwards, which is the index
        // calcTrendValue expects.
        for step, p := range points[1:] {
                // Scale the raw value against the smoothing factor.
                scaledRaw := smoothingFactor * p.F

                // Scale the last smoothed value with the trend at this point.
                trend = calcTrendValue(step, trendFactor, prevSmoothed, smoothed, trend)
                scaledHistory := (1 - smoothingFactor) * (smoothed + trend)

                prevSmoothed, smoothed = smoothed, scaledRaw+scaledHistory
        }

        return append(enh.Out, Sample{F: smoothed}), nil
}

// === sort(node parser.ValueTypeVector) (Vector, Annotations) ===
//
// funcSort sorts the input vector in place and returns it.
func funcSort(vals []parser.Value, args parser.Expressions, enh *EvalNodeHelper) (Vector, annotations.Annotations) {
        vec := vals[0].(Vector)
        // NaN should sort to the bottom, so take descending sort with NaN first
        // and reverse it. The heap type shares vec's backing array, so vec
        // itself ends up sorted.
        sort.Sort(sort.Reverse(vectorByReverseValueHeap(vec)))
        return vec, nil
}

// === sort_desc(node parser.ValueTypeVector) (Vector, Annotations) ===
//
// funcSortDesc sorts the input vector in place, in the opposite direction of
// funcSort, and returns it.
func funcSortDesc(vals []parser.Value, args parser.Expressions, enh *EvalNodeHelper) (Vector, annotations.Annotations) {
        vec := vals[0].(Vector)
        // NaN should sort to the bottom, so take ascending sort with NaN first
        // and reverse it. The heap type shares vec's backing array, so vec
        // itself ends up sorted.
        sort.Sort(sort.Reverse(vectorByValueHeap(vec)))
        return vec, nil
}

// === sort_by_label(vector parser.ValueTypeVector, label parser.ValueTypeString...) (Vector, Annotations) ===
//
// funcSortByLabel sorts the input vector by the values of the labels given in
// args[1:], compared in natural order and in ascending direction. Labels are
// consulted in the order given; the first label with differing values decides.
// Samples that are equal on all given labels keep the order established by
// funcSort, i.e. they are ordered by value with NaN at the bottom.
func funcSortByLabel(vals []parser.Value, args parser.Expressions, enh *EvalNodeHelper) (Vector, annotations.Annotations) {
        // In case the labels are the same, NaN should sort to the bottom, so
        // pre-sort by value first.
        var anno annotations.Annotations
        vals[0], anno = funcSort(vals, args, enh)
        sortLabels := stringSliceFromArgs(args[1:])
        // A stable sort is required here: an unstable sort would be free to
        // reorder samples with equal label values and thereby discard the
        // value ordering established above.
        slices.SortStableFunc(vals[0].(Vector), func(a, b Sample) int {
                // Iterate over each given label.
                for _, label := range sortLabels {
                        lv1 := a.Metric.Get(label)
                        lv2 := b.Metric.Get(label)

                        if lv1 == lv2 {
                                continue
                        }

                        if natsort.Compare(lv1, lv2) {
                                return -1
                        }

                        return +1
                }

                return 0
        })

        return vals[0].(Vector), anno
}

// === sort_by_label_desc(vector parser.ValueTypeVector, label parser.ValueTypeString...) (Vector, Annotations) ===
//
// funcSortByLabelDesc sorts the input vector by the values of the labels given
// in args[1:], compared in natural order and in descending direction. Labels
// are consulted in the order given; the first label with differing values
// decides. Samples that are equal on all given labels keep the order
// established by funcSortDesc.
func funcSortByLabelDesc(vals []parser.Value, args parser.Expressions, enh *EvalNodeHelper) (Vector, annotations.Annotations) {
        // In case the labels are the same, NaN should sort to the bottom, so
        // pre-sort by value (descending) first.
        var anno annotations.Annotations
        vals[0], anno = funcSortDesc(vals, args, enh)
        sortLabels := stringSliceFromArgs(args[1:])
        // A stable sort is required here: an unstable sort would be free to
        // reorder samples with equal label values and thereby discard the
        // value ordering established above.
        slices.SortStableFunc(vals[0].(Vector), func(a, b Sample) int {
                // Iterate over each given label.
                for _, label := range sortLabels {
                        lv1 := a.Metric.Get(label)
                        lv2 := b.Metric.Get(label)

                        if lv1 == lv2 {
                                continue
                        }

                        if natsort.Compare(lv1, lv2) {
                                return +1
                        }

                        return -1
                }

                return 0
        })

        return vals[0].(Vector), anno
}

// === clamp(Vector parser.ValueTypeVector, min, max Scalar) (Vector, Annotations) ===
//
// funcClamp limits every sample value of the input vector to the range given
// by vals[1] (lower bound) and vals[2] (upper bound) and drops the metric
// name. If the upper bound is smaller than the lower bound, nothing is
// appended to enh.Out.
func funcClamp(vals []parser.Value, args parser.Expressions, enh *EvalNodeHelper) (Vector, annotations.Annotations) {
        input := vals[0].(Vector)
        lower := vals[1].(Vector)[0].F
        upper := vals[2].(Vector)[0].F
        if upper < lower {
                return enh.Out, nil
        }
        for i := range input {
                clamped := math.Max(lower, math.Min(upper, input[i].F))
                enh.Out = append(enh.Out, Sample{
                        Metric: input[i].Metric.DropMetricName(),
                        F:      clamped,
                })
        }
        return enh.Out, nil
}

// === clamp_max(Vector parser.ValueTypeVector, max Scalar) (Vector, Annotations) ===
//
// funcClampMax limits every sample value of the input vector to the upper
// bound given by vals[1] and drops the metric name.
func funcClampMax(vals []parser.Value, args parser.Expressions, enh *EvalNodeHelper) (Vector, annotations.Annotations) {
        input := vals[0].(Vector)
        upper := vals[1].(Vector)[0].F
        for i := range input {
                capped := math.Min(upper, input[i].F)
                enh.Out = append(enh.Out, Sample{
                        Metric: input[i].Metric.DropMetricName(),
                        F:      capped,
                })
        }
        return enh.Out, nil
}

// === clamp_min(Vector parser.ValueTypeVector, min Scalar) (Vector, Annotations) ===
//
// funcClampMin raises every sample value of the input vector to at least the
// lower bound given by vals[1] and drops the metric name.
func funcClampMin(vals []parser.Value, args parser.Expressions, enh *EvalNodeHelper) (Vector, annotations.Annotations) {
        input := vals[0].(Vector)
        lower := vals[1].(Vector)[0].F
        for i := range input {
                floored := math.Max(lower, input[i].F)
                enh.Out = append(enh.Out, Sample{
                        Metric: input[i].Metric.DropMetricName(),
                        F:      floored,
                })
        }
        return enh.Out, nil
}

// === round(Vector parser.ValueTypeVector, toNearest=1 Scalar) (Vector, Annotations) ===
//
// funcRound rounds every sample value of the input vector to the nearest
// multiple of toNearest and drops the metric name. toNearest defaults to 1 and
// is taken from vals[1] when a second argument is present. Ties are solved by
// rounding up.
func funcRound(vals []parser.Value, args parser.Expressions, enh *EvalNodeHelper) (Vector, annotations.Annotations) {
        input := vals[0].(Vector)

        step := float64(1)
        if len(args) >= 2 {
                step = vals[1].(Vector)[0].F
        }
        // Work with the inverse as it seems to cause fewer floating point
        // accuracy issues.
        inverseStep := 1.0 / step

        for i := range input {
                rounded := math.Floor(input[i].F*inverseStep+0.5) / inverseStep
                enh.Out = append(enh.Out, Sample{
                        Metric: input[i].Metric.DropMetricName(),
                        F:      rounded,
                })
        }
        return enh.Out, nil
}

// === Scalar(node parser.ValueTypeVector) Scalar ===
//
// funcScalar appends the value of the only sample of the input vector to
// enh.Out. If the input vector does not have exactly one sample, NaN is
// appended instead.
func funcScalar(vals []parser.Value, args parser.Expressions, enh *EvalNodeHelper) (Vector, annotations.Annotations) {
        value := math.NaN()
        if v := vals[0].(Vector); len(v) == 1 {
                value = v[0].F
        }
        return append(enh.Out, Sample{F: value}), nil
}

// aggrOverTime applies aggrFn to the first series of the matrix in vals[0]
// and appends the resulting float as a sample to enh.Out.
func aggrOverTime(vals []parser.Value, enh *EvalNodeHelper, aggrFn func(Series) float64) Vector {
        series := vals[0].(Matrix)[0]
        aggregated := Sample{F: aggrFn(series)}
        return append(enh.Out, aggregated)
}

// aggrHistOverTime applies aggrFn to the first series of the matrix in
// vals[0] and appends the resulting histogram as a sample to enh.Out. The
// error returned by aggrFn is passed through; the sample is appended even if
// that error is non-nil.
func aggrHistOverTime(vals []parser.Value, enh *EvalNodeHelper, aggrFn func(Series) (*histogram.FloatHistogram, error)) (Vector, error) {
        series := vals[0].(Matrix)[0]
        aggregated, err := aggrFn(series)
        out := append(enh.Out, Sample{H: aggregated})
        return out, err
}

// === avg_over_time(Matrix parser.ValueTypeMatrix) (Vector, Annotations)  ===
//
// funcAvgOverTime computes the average over the samples of the first series in
// vals[0] and appends it to enh.Out.
//
// A series containing both floats and histograms yields no sample but a
// mixed-floats-histograms warning. A series with histograms only is averaged
// as histograms; a series with floats only is averaged as floats. In both
// cases the mean is updated incrementally (mean += value/count - mean/count)
// rather than by summing first and dividing at the end.
func funcAvgOverTime(vals []parser.Value, args parser.Expressions, enh *EvalNodeHelper) (Vector, annotations.Annotations) {
        firstSeries := vals[0].(Matrix)[0]
        if len(firstSeries.Floats) > 0 && len(firstSeries.Histograms) > 0 {
                metricName := firstSeries.Metric.Get(labels.MetricName)
                return enh.Out, annotations.New().Add(annotations.NewMixedFloatsHistogramsWarning(metricName, args[0].PositionRange()))
        }
        if len(firstSeries.Floats) == 0 {
                // The passed values only contain histograms.
                vec, err := aggrHistOverTime(vals, enh, func(s Series) (*histogram.FloatHistogram, error) {
                        count := 1
                        mean := s.Histograms[0].H.Copy()
                        for _, h := range s.Histograms[1:] {
                                count++
                                // Incremental mean update on copies, so that
                                // neither the input histogram nor the current
                                // mean is modified by the division.
                                left := h.H.Copy().Div(float64(count))
                                right := mean.Copy().Div(float64(count))
                                toAdd, err := left.Sub(right)
                                if err != nil {
                                        return mean, err
                                }
                                _, err = mean.Add(toAdd)
                                if err != nil {
                                        return mean, err
                                }
                        }
                        return mean, nil
                })
                if err != nil {
                        // Incompatible histograms yield a warning and no
                        // sample. Any other error is not reported and vec is
                        // returned as is.
                        metricName := firstSeries.Metric.Get(labels.MetricName)
                        if errors.Is(err, histogram.ErrHistogramsIncompatibleSchema) {
                                return enh.Out, annotations.New().Add(annotations.NewMixedExponentialCustomHistogramsWarning(metricName, args[0].PositionRange()))
                        } else if errors.Is(err, histogram.ErrHistogramsIncompatibleBounds) {
                                return enh.Out, annotations.New().Add(annotations.NewIncompatibleCustomBucketsHistogramsWarning(metricName, args[0].PositionRange()))
                        }
                }
                return vec, nil
        }
        return aggrOverTime(vals, enh, func(s Series) float64 {
                // mean is the running mean; c is the second value threaded
                // through kahanSumInc (presumably the compensation term of a
                // Kahan summation - confirm against kahanSumInc).
                var mean, count, c float64
                for _, f := range s.Floats {
                        count++
                        if math.IsInf(mean, 0) {
                                if math.IsInf(f.F, 0) && (mean > 0) == (f.F > 0) {
                                        // The `mean` and `f.F` values are `Inf` of the same sign.  They
                                        // can't be subtracted, but the value of `mean` is correct
                                        // already.
                                        continue
                                }
                                if !math.IsInf(f.F, 0) && !math.IsNaN(f.F) {
                                        // At this stage, the mean is an infinite. If the added
                                        // value is neither an Inf or a Nan, we can keep that mean
                                        // value.
                                        // This is required because our calculation below removes
                                        // the mean value, which would look like Inf += x - Inf and
                                        // end up as a NaN.
                                        continue
                                }
                        }
                        mean, c = kahanSumInc(f.F/count-mean/count, mean, c)
                }

                // An infinite mean is returned as is, without adding c.
                if math.IsInf(mean, 0) {
                        return mean
                }
                return mean + c
        }), nil
}

// === count_over_time(Matrix parser.ValueTypeMatrix) (Vector, Annotations) ===
//
// funcCountOverTime appends the number of samples (floats plus histograms) of
// the first series in vals[0] to enh.Out.
func funcCountOverTime(vals []parser.Value, args parser.Expressions, enh *EvalNodeHelper) (Vector, annotations.Annotations) {
        countSamples := func(s Series) float64 {
                total := len(s.Floats) + len(s.Histograms)
                return float64(total)
        }
        return aggrOverTime(vals, enh, countSamples), nil
}

// === last_over_time(Matrix parser.ValueTypeMatrix) (Vector, Annotations) ===
//
// funcLastOverTime appends the most recent sample of the first series in
// vals[0] to enh.Out, keeping the series' metric. If the series has a
// histogram whose timestamp is not older than that of the last float, a copy
// of that histogram is used; otherwise the last float value is used.
func funcLastOverTime(vals []parser.Value, args parser.Expressions, enh *EvalNodeHelper) (Vector, annotations.Annotations) {
        series := vals[0].(Matrix)[0]

        var (
                lastFloat FPoint
                lastHist  HPoint
        )
        if n := len(series.Floats); n > 0 {
                lastFloat = series.Floats[n-1]
        }
        if n := len(series.Histograms); n > 0 {
                lastHist = series.Histograms[n-1]
        }

        if lastHist.H != nil && lastHist.T >= lastFloat.T {
                return append(enh.Out, Sample{
                        Metric: series.Metric,
                        H:      lastHist.H.Copy(),
                }), nil
        }
        return append(enh.Out, Sample{
                Metric: series.Metric,
                F:      lastFloat.F,
        }), nil
}

// === mad_over_time(Matrix parser.ValueTypeMatrix) (Vector, Annotations) ===
//
// funcMadOverTime appends the median absolute deviation of the float samples
// of the first series in vals[0] to enh.Out: the 0.5-quantile of the absolute
// distances of all values to their 0.5-quantile. Nothing is appended if the
// series has no float samples.
func funcMadOverTime(vals []parser.Value, args parser.Expressions, enh *EvalNodeHelper) (Vector, annotations.Annotations) {
        if len(vals[0].(Matrix)[0].Floats) == 0 {
                return enh.Out, nil
        }
        medianAbsDeviation := func(s Series) float64 {
                raw := make(vectorByValueHeap, len(s.Floats))
                for i, f := range s.Floats {
                        raw[i] = Sample{F: f.F}
                }
                median := quantile(0.5, raw)

                // Use a fresh slice for the deviations rather than reusing raw.
                deviations := make(vectorByValueHeap, len(s.Floats))
                for i, f := range s.Floats {
                        deviations[i] = Sample{F: math.Abs(f.F - median)}
                }
                return quantile(0.5, deviations)
        }
        return aggrOverTime(vals, enh, medianAbsDeviation), nil
}

// === max_over_time(Matrix parser.ValueTypeMatrix) (Vector, Annotations) ===
//
// funcMaxOverTime appends the largest float value of the first series in
// vals[0] to enh.Out. A NaN candidate is replaced by any later value, so NaN
// is only returned if every float sample is NaN.
func funcMaxOverTime(vals []parser.Value, args parser.Expressions, enh *EvalNodeHelper) (Vector, annotations.Annotations) {
        if len(vals[0].(Matrix)[0].Floats) == 0 {
                // TODO(beorn7): The passed values only contain
                // histograms. max_over_time ignores histograms for now. If
                // there are only histograms, we have to return without adding
                // anything to enh.Out.
                return enh.Out, nil
        }
        largest := func(s Series) float64 {
                current := s.Floats[0].F
                for _, f := range s.Floats {
                        switch {
                        case f.F > current, math.IsNaN(current):
                                current = f.F
                        }
                }
                return current
        }
        return aggrOverTime(vals, enh, largest), nil
}

// === min_over_time(Matrix parser.ValueTypeMatrix) (Vector, Annotations) ===
// funcMinOverTime reports the smallest float value of a series. A NaN
// candidate is replaced by whatever value follows it, so NaN is only the
// result when every float in the series is NaN.
func funcMinOverTime(vals []parser.Value, args parser.Expressions, enh *EvalNodeHelper) (Vector, annotations.Annotations) {
        if len(vals[0].(Matrix)[0].Floats) == 0 {
                // TODO(beorn7): The passed values only contain
                // histograms. min_over_time ignores histograms for now. If
                // there are only histograms, we have to return without adding
                // anything to enh.Out.
                return enh.Out, nil
        }
        smallest := func(s Series) float64 {
                best := s.Floats[0].F
                // The first element is the initial candidate, so the scan
                // starts at the second one.
                for _, p := range s.Floats[1:] {
                        if math.IsNaN(best) || p.F < best {
                                best = p.F
                        }
                }
                return best
        }
        return aggrOverTime(vals, enh, smallest), nil
}

// === sum_over_time(Matrix parser.ValueTypeMatrix) (Vector, Annotations) ===
// funcSumOverTime sums the samples of a series. Three cases are distinguished
// based on the first series of the matrix:
//   - floats and histograms mixed: nothing is added to enh.Out and a
//     mixed-floats-histograms warning is returned;
//   - floats only: the values are added up with compensated summation;
//   - no floats: the histograms are added up; the two known incompatibility
//     errors are turned into warnings with enh.Out returned unchanged.
func funcSumOverTime(vals []parser.Value, args parser.Expressions, enh *EvalNodeHelper) (Vector, annotations.Annotations) {
        first := vals[0].(Matrix)[0]
        hasFloats := len(first.Floats) > 0
        hasHistograms := len(first.Histograms) > 0

        switch {
        case hasFloats && hasHistograms:
                name := first.Metric.Get(labels.MetricName)
                return enh.Out, annotations.New().Add(annotations.NewMixedFloatsHistogramsWarning(name, args[0].PositionRange()))
        case hasFloats:
                return aggrOverTime(vals, enh, func(s Series) float64 {
                        var total, comp float64
                        for _, p := range s.Floats {
                                total, comp = kahanSumInc(p.F, total, comp)
                        }
                        // An infinite sum is returned as is, without the
                        // compensation term.
                        if math.IsInf(total, 0) {
                                return total
                        }
                        return total + comp
                }), nil
        }

        // The passed values only contain histograms.
        vec, err := aggrHistOverTime(vals, enh, func(s Series) (*histogram.FloatHistogram, error) {
                // Accumulate into a copy so the first input histogram is not modified.
                acc := s.Histograms[0].H.Copy()
                for _, hp := range s.Histograms[1:] {
                        if _, err := acc.Add(hp.H); err != nil {
                                return acc, err
                        }
                }
                return acc, nil
        })
        if err == nil {
                return vec, nil
        }

        name := first.Metric.Get(labels.MetricName)
        switch {
        case errors.Is(err, histogram.ErrHistogramsIncompatibleSchema):
                return enh.Out, annotations.New().Add(annotations.NewMixedExponentialCustomHistogramsWarning(name, args[0].PositionRange()))
        case errors.Is(err, histogram.ErrHistogramsIncompatibleBounds):
                return enh.Out, annotations.New().Add(annotations.NewIncompatibleCustomBucketsHistogramsWarning(name, args[0].PositionRange()))
        }
        // Any other error leaves the aggregated vector as the result.
        return vec, nil
}

// === quantile_over_time(Matrix parser.ValueTypeMatrix) (Vector, Annotations) ===
// funcQuantileOverTime appends the q-quantile of the float values of the
// series in vals[1] to enh.Out, with q taken from vals[0]. A q that is NaN or
// outside [0, 1] adds an invalid-quantile warning; the quantile is still
// computed in that case.
func funcQuantileOverTime(vals []parser.Value, args parser.Expressions, enh *EvalNodeHelper) (Vector, annotations.Annotations) {
        q := vals[0].(Vector)[0].F
        series := vals[1].(Matrix)[0]
        if len(series.Floats) == 0 {
                // TODO(beorn7): The passed values only contain
                // histograms. quantile_over_time ignores histograms for now. If
                // there are only histograms, we have to return without adding
                // anything to enh.Out.
                return enh.Out, nil
        }

        var annos annotations.Annotations
        if outOfRange := q < 0 || q > 1; outOfRange || math.IsNaN(q) {
                annos.Add(annotations.NewInvalidQuantileWarning(q, args[0].PositionRange()))
        }

        values := make(vectorByValueHeap, len(series.Floats))
        for i, p := range series.Floats {
                values[i] = Sample{F: p.F}
        }
        result := quantile(q, values)
        return append(enh.Out, Sample{F: result}), annos
}

// === stddev_over_time(Matrix parser.ValueTypeMatrix) (Vector, Annotations) ===
// funcStddevOverTime computes the standard deviation of the float values of a
// series in a single pass: a running mean and a running sum of squared
// deviations are maintained through kahanSumInc, and the square root of that
// sum divided by the number of values is returned.
func funcStddevOverTime(vals []parser.Value, args parser.Expressions, enh *EvalNodeHelper) (Vector, annotations.Annotations) {
        if len(vals[0].(Matrix)[0].Floats) == 0 {
                // TODO(beorn7): The passed values only contain
                // histograms. stddev_over_time ignores histograms for now. If
                // there are only histograms, we have to return without adding
                // anything to enh.Out.
                return enh.Out, nil
        }
        stddev := func(s Series) float64 {
                var n float64
                var mean, meanComp float64
                var sqDev, sqDevComp float64
                for _, p := range s.Floats {
                        n++
                        // Distance to the mean before this value is folded in.
                        delta := p.F - (mean + meanComp)
                        mean, meanComp = kahanSumInc(delta/n, mean, meanComp)
                        sqDev, sqDevComp = kahanSumInc(delta*(p.F-(mean+meanComp)), sqDev, sqDevComp)
                }
                return math.Sqrt((sqDev + sqDevComp) / n)
        }
        return aggrOverTime(vals, enh, stddev), nil
}

// === stdvar_over_time(Matrix parser.ValueTypeMatrix) (Vector, Annotations) ===
// funcStdvarOverTime computes the variance of the float values of a series in
// a single pass: a running mean and a running sum of squared deviations are
// maintained through kahanSumInc, and that sum divided by the number of values
// is returned.
func funcStdvarOverTime(vals []parser.Value, args parser.Expressions, enh *EvalNodeHelper) (Vector, annotations.Annotations) {
        if len(vals[0].(Matrix)[0].Floats) == 0 {
                // TODO(beorn7): The passed values only contain
                // histograms. stdvar_over_time ignores histograms for now. If
                // there are only histograms, we have to return without adding
                // anything to enh.Out.
                return enh.Out, nil
        }
        stdvar := func(s Series) float64 {
                var n float64
                var mean, meanComp float64
                var sqDev, sqDevComp float64
                for _, p := range s.Floats {
                        n++
                        // Distance to the mean before this value is folded in.
                        delta := p.F - (mean + meanComp)
                        mean, meanComp = kahanSumInc(delta/n, mean, meanComp)
                        sqDev, sqDevComp = kahanSumInc(delta*(p.F-(mean+meanComp)), sqDev, sqDevComp)
                }
                return (sqDev + sqDevComp) / n
        }
        return aggrOverTime(vals, enh, stdvar), nil
}

// === absent(Vector parser.ValueTypeVector) (Vector, Annotations) ===
// funcAbsent appends a single sample with value 1 to enh.Out when the input
// vector is empty; its labels are built from the argument expression by
// createLabelsForAbsentFunction. A non-empty input leaves enh.Out unchanged.
func funcAbsent(vals []parser.Value, args parser.Expressions, enh *EvalNodeHelper) (Vector, annotations.Annotations) {
        if len(vals[0].(Vector)) == 0 {
                marker := Sample{
                        Metric: createLabelsForAbsentFunction(args[0]),
                        F:      1,
                }
                return append(enh.Out, marker), nil
        }
        return enh.Out, nil
}

// === absent_over_time(Vector parser.ValueTypeMatrix) (Vector, Annotations) ===
// funcAbsentOverTime unconditionally appends one sample with value 1 to
// enh.Out. Because the argument is a matrix, the function does not get to see
// all the series. As an engine optimization it is only called when the matrix
// has at least one element; the engine then post-processes the results to get
// the expected output.
func funcAbsentOverTime(vals []parser.Value, args parser.Expressions, enh *EvalNodeHelper) (Vector, annotations.Annotations) {
        present := Sample{F: 1}
        return append(enh.Out, present), nil
}

// === present_over_time(Vector parser.ValueTypeMatrix) (Vector, Annotations) ===
// funcPresentOverTime hands aggrOverTime a callback that yields the constant 1
// for any series, regardless of its samples.
func funcPresentOverTime(vals []parser.Value, args parser.Expressions, enh *EvalNodeHelper) (Vector, annotations.Annotations) {
        alwaysOne := func(Series) float64 { return 1 }
        return aggrOverTime(vals, enh, alwaysOne), nil
}

// simpleFunc applies f to the value of every float sample of the vector in
// vals[0] and appends the result, with the metric name dropped, to enh.Out.
// Samples that carry a histogram are skipped. It returns enh.Out.
func simpleFunc(vals []parser.Value, enh *EvalNodeHelper, f func(float64) float64) Vector {
        for _, sample := range vals[0].(Vector) {
                if sample.H != nil {
                        // Process only float samples.
                        continue
                }
                mapped := Sample{
                        Metric: sample.Metric.DropMetricName(),
                        F:      f(sample.F),
                }
                enh.Out = append(enh.Out, mapped)
        }
        return enh.Out
}

// === abs(Vector parser.ValueTypeVector) (Vector, Annotations) ===
// funcAbs applies math.Abs to every float sample of the input vector via
// simpleFunc. It never produces annotations.
func funcAbs(vals []parser.Value, args parser.Expressions, enh *EvalNodeHelper) (Vector, annotations.Annotations) {
        out := simpleFunc(vals, enh, math.Abs)
        return out, nil
}

// === ceil(Vector parser.ValueTypeVector) (Vector, Annotations) ===
// funcCeil applies math.Ceil to every float sample of the input vector via
// simpleFunc. It never produces annotations.
func funcCeil(vals []parser.Value, args parser.Expressions, enh *EvalNodeHelper) (Vector, annotations.Annotations) {
        out := simpleFunc(vals, enh, math.Ceil)
        return out, nil
}

// === floor(Vector parser.ValueTypeVector) (Vector, Annotations) ===
// funcFloor applies math.Floor to every float sample of the input vector via
// simpleFunc. It never produces annotations.
func funcFloor(vals []parser.Value, args parser.Expressions, enh *EvalNodeHelper) (Vector, annotations.Annotations) {
        out := simpleFunc(vals, enh, math.Floor)
        return out, nil
}

// === exp(Vector parser.ValueTypeVector) (Vector, Annotations) ===
// funcExp applies math.Exp to every float sample of the input vector via
// simpleFunc. It never produces annotations.
func funcExp(vals []parser.Value, args parser.Expressions, enh *EvalNodeHelper) (Vector, annotations.Annotations) {
        out := simpleFunc(vals, enh, math.Exp)
        return out, nil
}

// === sqrt(Vector parser.ValueTypeVector) (Vector, Annotations) ===
// funcSqrt applies math.Sqrt to every float sample of the input vector via
// simpleFunc. It never produces annotations.
func funcSqrt(vals []parser.Value, args parser.Expressions, enh *EvalNodeHelper) (Vector, annotations.Annotations) {
        out := simpleFunc(vals, enh, math.Sqrt)
        return out, nil
}

// === ln(Vector parser.ValueTypeVector) (Vector, Annotations) ===
// funcLn applies math.Log (the natural logarithm) to every float sample of the
// input vector via simpleFunc. It never produces annotations.
func funcLn(vals []parser.Value, args parser.Expressions, enh *EvalNodeHelper) (Vector, annotations.Annotations) {
        out := simpleFunc(vals, enh, math.Log)
        return out, nil
}

// === log2(Vector parser.ValueTypeVector) (Vector, Annotations) ===
// funcLog2 applies math.Log2 to every float sample of the input vector via
// simpleFunc. It never produces annotations.
func funcLog2(vals []parser.Value, args parser.Expressions, enh *EvalNodeHelper) (Vector, annotations.Annotations) {
        out := simpleFunc(vals, enh, math.Log2)
        return out, nil
}

// === log10(Vector parser.ValueTypeVector) (Vector, Annotations) ===
// funcLog10 applies math.Log10 to every float sample of the input vector via
// simpleFunc. It never produces annotations.
func funcLog10(vals []parser.Value, args parser.Expressions, enh *EvalNodeHelper) (Vector, annotations.Annotations) {
        out := simpleFunc(vals, enh, math.Log10)
        return out, nil
}

// === sin(Vector parser.ValueTypeVector) (Vector, Annotations) ===
// funcSin applies math.Sin to every float sample of the input vector via
// simpleFunc. It never produces annotations.
func funcSin(vals []parser.Value, args parser.Expressions, enh *EvalNodeHelper) (Vector, annotations.Annotations) {
        out := simpleFunc(vals, enh, math.Sin)
        return out, nil
}

// === cos(Vector parser.ValueTypeVector) (Vector, Annotations) ===
// funcCos applies math.Cos to every float sample of the input vector via
// simpleFunc. It never produces annotations.
func funcCos(vals []parser.Value, args parser.Expressions, enh *EvalNodeHelper) (Vector, annotations.Annotations) {
        out := simpleFunc(vals, enh, math.Cos)
        return out, nil
}

// === tan(Vector parser.ValueTypeVector) (Vector, Annotations) ===
// funcTan applies math.Tan to every float sample of the input vector via
// simpleFunc. It never produces annotations.
func funcTan(vals []parser.Value, args parser.Expressions, enh *EvalNodeHelper) (Vector, annotations.Annotations) {
        out := simpleFunc(vals, enh, math.Tan)
        return out, nil
}

// === asin(Vector parser.ValueTypeVector) (Vector, Annotations) ===
// funcAsin applies math.Asin to every float sample of the input vector via
// simpleFunc. It never produces annotations.
func funcAsin(vals []parser.Value, args parser.Expressions, enh *EvalNodeHelper) (Vector, annotations.Annotations) {
        out := simpleFunc(vals, enh, math.Asin)
        return out, nil
}

// === acos(Vector parser.ValueTypeVector) (Vector, Annotations) ===
// funcAcos applies math.Acos to every float sample of the input vector via
// simpleFunc. It never produces annotations.
func funcAcos(vals []parser.Value, args parser.Expressions, enh *EvalNodeHelper) (Vector, annotations.Annotations) {
        out := simpleFunc(vals, enh, math.Acos)
        return out, nil
}

// === atan(Vector parser.ValueTypeVector) (Vector, Annotations) ===
// funcAtan applies math.Atan to every float sample of the input vector via
// simpleFunc. It never produces annotations.
func funcAtan(vals []parser.Value, args parser.Expressions, enh *EvalNodeHelper) (Vector, annotations.Annotations) {
        out := simpleFunc(vals, enh, math.Atan)
        return out, nil
}

// === sinh(Vector parser.ValueTypeVector) (Vector, Annotations) ===
// funcSinh applies math.Sinh to every float sample of the input vector via
// simpleFunc. It never produces annotations.
func funcSinh(vals []parser.Value, args parser.Expressions, enh *EvalNodeHelper) (Vector, annotations.Annotations) {
        out := simpleFunc(vals, enh, math.Sinh)
        return out, nil
}

// === cosh(Vector parser.ValueTypeVector) (Vector, Annotations) ===
// funcCosh applies math.Cosh to every float sample of the input vector via
// simpleFunc. It never produces annotations.
func funcCosh(vals []parser.Value, args parser.Expressions, enh *EvalNodeHelper) (Vector, annotations.Annotations) {
        out := simpleFunc(vals, enh, math.Cosh)
        return out, nil
}

// === tanh(Vector parser.ValueTypeVector) (Vector, Annotations) ===
// funcTanh applies math.Tanh to every float sample of the input vector via
// simpleFunc. It never produces annotations.
func funcTanh(vals []parser.Value, args parser.Expressions, enh *EvalNodeHelper) (Vector, annotations.Annotations) {
        out := simpleFunc(vals, enh, math.Tanh)
        return out, nil
}

// === asinh(Vector parser.ValueTypeVector) (Vector, Annotations) ===
// funcAsinh applies math.Asinh to every float sample of the input vector via
// simpleFunc. It never produces annotations.
func funcAsinh(vals []parser.Value, args parser.Expressions, enh *EvalNodeHelper) (Vector, annotations.Annotations) {
        out := simpleFunc(vals, enh, math.Asinh)
        return out, nil
}

// === acosh(Vector parser.ValueTypeVector) (Vector, Annotations) ===
// funcAcosh applies math.Acosh to every float sample of the input vector via
// simpleFunc. It never produces annotations.
func funcAcosh(vals []parser.Value, args parser.Expressions, enh *EvalNodeHelper) (Vector, annotations.Annotations) {
        out := simpleFunc(vals, enh, math.Acosh)
        return out, nil
}

// === atanh(Vector parser.ValueTypeVector) (Vector, Annotations) ===
// funcAtanh applies math.Atanh to every float sample of the input vector via
// simpleFunc. It never produces annotations.
func funcAtanh(vals []parser.Value, args parser.Expressions, enh *EvalNodeHelper) (Vector, annotations.Annotations) {
        out := simpleFunc(vals, enh, math.Atanh)
        return out, nil
}

// === rad(Vector parser.ValueTypeVector) (Vector, Annotations) ===
// funcRad multiplies every float sample of the input vector by math.Pi and
// divides by 180 (degrees to radians), using simpleFunc.
func funcRad(vals []parser.Value, args parser.Expressions, enh *EvalNodeHelper) (Vector, annotations.Annotations) {
        toRadians := func(degrees float64) float64 {
                return degrees * math.Pi / 180
        }
        return simpleFunc(vals, enh, toRadians), nil
}

// === deg(Vector parser.ValueTypeVector) (Vector, Annotations) ===
// funcDeg multiplies every float sample of the input vector by 180 and divides
// by math.Pi (radians to degrees), using simpleFunc.
func funcDeg(vals []parser.Value, args parser.Expressions, enh *EvalNodeHelper) (Vector, annotations.Annotations) {
        toDegrees := func(radians float64) float64 {
                return radians * 180 / math.Pi
        }
        return simpleFunc(vals, enh, toDegrees), nil
}

// === pi() Scalar ===
// funcPi returns a newly built one-element vector holding math.Pi. Unlike most
// functions here it does not append to enh.Out.
func funcPi(vals []parser.Value, args parser.Expressions, enh *EvalNodeHelper) (Vector, annotations.Annotations) {
        out := Vector{{F: math.Pi}}
        return out, nil
}

// === sgn(Vector parser.ValueTypeVector) (Vector, Annotations) ===
// funcSgn maps every float sample of the input vector to -1 if it is negative
// and to 1 if it is positive. Any other value (one that is neither below nor
// above zero) is passed through unchanged.
func funcSgn(vals []parser.Value, args parser.Expressions, enh *EvalNodeHelper) (Vector, annotations.Annotations) {
        sign := func(v float64) float64 {
                if v < 0 {
                        return -1
                }
                if v > 0 {
                        return 1
                }
                return v
        }
        return simpleFunc(vals, enh, sign), nil
}

// === timestamp(Vector parser.ValueTypeVector) (Vector, Annotations) ===
// funcTimestamp appends, for every sample of the input vector, a sample whose
// value is the input's timestamp T divided by 1000 and whose metric name is
// dropped.
// NOTE(review): the division presumably converts milliseconds to seconds;
// confirm against the definition of Sample.T.
func funcTimestamp(vals []parser.Value, args parser.Expressions, enh *EvalNodeHelper) (Vector, annotations.Annotations) {
        input := vals[0].(Vector)
        for i := range input {
                stamped := Sample{
                        Metric: input[i].Metric.DropMetricName(),
                        F:      float64(input[i].T) / 1000,
                }
                enh.Out = append(enh.Out, stamped)
        }
        return enh.Out, nil
}

// kahanSumInc performs one step of a compensated summation: it adds inc to
// sum and folds the rounding error of that addition into the running
// compensation c. It returns the new sum and the new compensation; callers add
// the two together to obtain the corrected total.
func kahanSumInc(inc, sum, c float64) (newSum, newC float64) {
        newSum = sum + inc

        // Using Neumaier improvement, swap if next term larger than sum.
        var lost float64
        if math.Abs(sum) >= math.Abs(inc) {
                lost = (sum - newSum) + inc
        } else {
                lost = (inc - newSum) + sum
        }
        newC = c + lost
        return newSum, newC
}

// linearRegression fits a least-squares line through the given points, using
// (T - interceptTime) / 1e3 as the x coordinate and F as the y coordinate. It
// returns the slope of that line and its value at interceptTime.
//
// If every point has the same F, the slope is 0 and the intercept is that
// value, except that (NaN, NaN) is returned when the value is infinite.
// samples must not be empty; the first element is read unconditionally.
func linearRegression(samples []FPoint, interceptTime int64) (slope, intercept float64) {
        var count float64
        var xSum, xComp float64
        var ySum, yComp float64
        var xySum, xyComp float64
        var xxSum, xxComp float64

        firstY := samples[0].F
        flat := true
        for idx, pt := range samples {
                // The series stops being flat as soon as a later y value
                // differs from the first one.
                if flat && idx > 0 && pt.F != firstY {
                        flat = false
                }
                count += 1.0
                x := float64(pt.T-interceptTime) / 1e3
                xSum, xComp = kahanSumInc(x, xSum, xComp)
                ySum, yComp = kahanSumInc(pt.F, ySum, yComp)
                xySum, xyComp = kahanSumInc(x*pt.F, xySum, xyComp)
                xxSum, xxComp = kahanSumInc(x*x, xxSum, xxComp)
        }
        if flat {
                if math.IsInf(firstY, 0) {
                        return math.NaN(), math.NaN()
                }
                return 0, firstY
        }

        // Fold the compensation terms into their sums before using them.
        xSum += xComp
        ySum += yComp
        xySum += xyComp
        xxSum += xxComp

        covXY := xySum - xSum*ySum/count
        varX := xxSum - xSum*xSum/count

        slope = covXY / varX
        intercept = ySum/count - slope*xSum/count
        return slope, intercept
}

// === deriv(node parser.ValueTypeMatrix) (Vector, Annotations) ===
// funcDeriv appends the slope of the least-squares line through the float
// samples of the series to enh.Out. With fewer than two floats nothing is
// appended.
func funcDeriv(vals []parser.Value, args parser.Expressions, enh *EvalNodeHelper) (Vector, annotations.Annotations) {
        floats := vals[0].(Matrix)[0].Floats

        // No sense in trying to compute a derivative without at least two points.
        // Drop this Vector element.
        if len(floats) < 2 {
                return enh.Out, nil
        }

        // We pass in an arbitrary timestamp that is near the values in use
        // to avoid floating point accuracy issues, see
        // https://github.com/prometheus/prometheus/issues/2674
        reference := floats[0].T
        slope, _ := linearRegression(floats, reference)
        return append(enh.Out, Sample{F: slope}), nil
}

// === predict_linear(node parser.ValueTypeMatrix, k parser.ValueTypeScalar) (Vector, Annotations) ===
// funcPredictLinear fits a least-squares line through the float samples of the
// series, with the intercept taken at enh.Ts, and appends slope*duration +
// intercept to enh.Out, where duration is the scalar in vals[1]. With fewer
// than two floats nothing is appended.
func funcPredictLinear(vals []parser.Value, args parser.Expressions, enh *EvalNodeHelper) (Vector, annotations.Annotations) {
        floats := vals[0].(Matrix)[0].Floats
        duration := vals[1].(Vector)[0].F

        // No sense in trying to predict anything without at least two points.
        // Drop this Vector element.
        if len(floats) < 2 {
                return enh.Out, nil
        }

        slope, intercept := linearRegression(floats, enh.Ts)
        predicted := Sample{F: slope*duration + intercept}
        return append(enh.Out, predicted), nil
}

// === histogram_count(Vector parser.ValueTypeVector) (Vector, Annotations) ===
// funcHistogramCount appends, for every histogram sample of the input vector,
// a float sample holding the histogram's Count, with the metric name dropped.
// Samples without a histogram are skipped.
func funcHistogramCount(vals []parser.Value, args parser.Expressions, enh *EvalNodeHelper) (Vector, annotations.Annotations) {
        for _, sample := range vals[0].(Vector) {
                if hist := sample.H; hist != nil {
                        enh.Out = append(enh.Out, Sample{
                                Metric: sample.Metric.DropMetricName(),
                                F:      hist.Count,
                        })
                }
        }
        return enh.Out, nil
}

// === histogram_sum(Vector parser.ValueTypeVector) (Vector, Annotations) ===
// funcHistogramSum appends, for every histogram sample of the input vector, a
// float sample holding the histogram's Sum, with the metric name dropped.
// Samples without a histogram are skipped.
func funcHistogramSum(vals []parser.Value, args parser.Expressions, enh *EvalNodeHelper) (Vector, annotations.Annotations) {
        for _, sample := range vals[0].(Vector) {
                if hist := sample.H; hist != nil {
                        enh.Out = append(enh.Out, Sample{
                                Metric: sample.Metric.DropMetricName(),
                                F:      hist.Sum,
                        })
                }
        }
        return enh.Out, nil
}

// === histogram_avg(Vector parser.ValueTypeVector) (Vector, Annotations) ===
// funcHistogramAvg appends, for every histogram sample of the input vector, a
// float sample holding Sum divided by Count, with the metric name dropped.
// Samples without a histogram are skipped. A Count of zero is not treated
// specially.
func funcHistogramAvg(vals []parser.Value, args parser.Expressions, enh *EvalNodeHelper) (Vector, annotations.Annotations) {
        for _, sample := range vals[0].(Vector) {
                if hist := sample.H; hist != nil {
                        enh.Out = append(enh.Out, Sample{
                                Metric: sample.Metric.DropMetricName(),
                                F:      hist.Sum / hist.Count,
                        })
                }
        }
        return enh.Out, nil
}

// === histogram_stddev(Vector parser.ValueTypeVector) (Vector, Annotations) ===
// funcHistogramStdDev estimates the standard deviation of the observations in
// each histogram sample of the input vector. Every non-empty bucket
// contributes its count times the squared distance between a representative
// value and the histogram mean (Sum / Count); the square root of the total
// divided by Count is appended to enh.Out with the metric name dropped.
// Samples without a histogram are skipped.
func funcHistogramStdDev(vals []parser.Value, args parser.Expressions, enh *EvalNodeHelper) (Vector, annotations.Annotations) {
        for _, sample := range vals[0].(Vector) {
                hist := sample.H
                if hist == nil {
                        // Skip non-histogram samples.
                        continue
                }
                mean := hist.Sum / hist.Count

                var sqDev, sqDevComp float64
                for it := hist.AllBucketIterator(); it.Next(); {
                        b := it.At()
                        if b.Count == 0 {
                                continue
                        }
                        // A bucket containing zero is represented by 0, any
                        // other bucket by the geometric mean of its bounds,
                        // negated for buckets below zero.
                        var rep float64
                        switch {
                        case b.Lower <= 0 && 0 <= b.Upper:
                                rep = 0
                        case b.Upper < 0:
                                rep = -math.Sqrt(b.Upper * b.Lower)
                        default:
                                rep = math.Sqrt(b.Upper * b.Lower)
                        }
                        delta := rep - mean
                        sqDev, sqDevComp = kahanSumInc(b.Count*delta*delta, sqDev, sqDevComp)
                }
                variance := (sqDev + sqDevComp) / hist.Count

                enh.Out = append(enh.Out, Sample{
                        Metric: sample.Metric.DropMetricName(),
                        F:      math.Sqrt(variance),
                })
        }
        return enh.Out, nil
}

// === histogram_stdvar(Vector parser.ValueTypeVector) (Vector, Annotations) ===
// funcHistogramStdVar estimates the variance of the observations in each
// histogram sample of the input vector. Every non-empty bucket contributes its
// count times the squared distance between a representative value and the
// histogram mean (Sum / Count); the total divided by Count is appended to
// enh.Out with the metric name dropped. Samples without a histogram are
// skipped.
func funcHistogramStdVar(vals []parser.Value, args parser.Expressions, enh *EvalNodeHelper) (Vector, annotations.Annotations) {
        for _, sample := range vals[0].(Vector) {
                hist := sample.H
                if hist == nil {
                        // Skip non-histogram samples.
                        continue
                }
                mean := hist.Sum / hist.Count

                var sqDev, sqDevComp float64
                for it := hist.AllBucketIterator(); it.Next(); {
                        b := it.At()
                        if b.Count == 0 {
                                continue
                        }
                        // A bucket containing zero is represented by 0, any
                        // other bucket by the geometric mean of its bounds,
                        // negated for buckets below zero.
                        var rep float64
                        switch {
                        case b.Lower <= 0 && 0 <= b.Upper:
                                rep = 0
                        case b.Upper < 0:
                                rep = -math.Sqrt(b.Upper * b.Lower)
                        default:
                                rep = math.Sqrt(b.Upper * b.Lower)
                        }
                        delta := rep - mean
                        sqDev, sqDevComp = kahanSumInc(b.Count*delta*delta, sqDev, sqDevComp)
                }
                variance := (sqDev + sqDevComp) / hist.Count

                enh.Out = append(enh.Out, Sample{
                        Metric: sample.Metric.DropMetricName(),
                        F:      variance,
                })
        }
        return enh.Out, nil
}

// === histogram_fraction(lower, upper parser.ValueTypeScalar, Vector parser.ValueTypeVector) (Vector, Annotations) ===
// funcHistogramFraction appends, for every histogram sample of the vector in
// vals[2], the result of histogramFraction for the bounds given in vals[0]
// (lower) and vals[1] (upper), with the metric name dropped. Samples without a
// histogram are skipped.
func funcHistogramFraction(vals []parser.Value, args parser.Expressions, enh *EvalNodeHelper) (Vector, annotations.Annotations) {
        lower := vals[0].(Vector)[0].F
        upper := vals[1].(Vector)[0].F

        for _, sample := range vals[2].(Vector) {
                if hist := sample.H; hist != nil {
                        enh.Out = append(enh.Out, Sample{
                                Metric: sample.Metric.DropMetricName(),
                                F:      histogramFraction(lower, upper, hist),
                        })
                }
        }
        return enh.Out, nil
}

// === histogram_quantile(k parser.ValueTypeScalar, Vector parser.ValueTypeVector) (Vector, Annotations) ===
// funcHistogramQuantile computes the q-quantile (q from vals[0]) for the
// histograms found in the vector in vals[1] and appends the results to enh.Out.
//
// Float samples are treated as classic histogram buckets: they are grouped by
// their label set without the bucket label and each group is evaluated with
// bucketQuantile. Samples carrying a histogram (H != nil) are evaluated with
// histogramQuantile. If a histogram sample and a group of classic buckets share
// the same signature, neither is evaluated and a warning is added instead.
//
// A q that is NaN or outside [0, 1] adds an invalid-quantile warning;
// evaluation still proceeds.
func funcHistogramQuantile(vals []parser.Value, args parser.Expressions, enh *EvalNodeHelper) (Vector, annotations.Annotations) {
        q := vals[0].(Vector)[0].F
        inVec := vals[1].(Vector)
        var annos annotations.Annotations

        if math.IsNaN(q) || q < 0 || q > 1 {
                annos.Add(annotations.NewInvalidQuantileWarning(q, args[0].PositionRange()))
        }

        // The signature map lives on enh and is kept between calls. Existing
        // entries are retained and only their bucket slices are truncated, so
        // that the slices' capacity can be reused.
        if enh.signatureToMetricWithBuckets == nil {
                enh.signatureToMetricWithBuckets = map[string]*metricWithBuckets{}
        } else {
                for _, v := range enh.signatureToMetricWithBuckets {
                        v.buckets = v.buckets[:0]
                }
        }

        var histogramSamples []Sample

        for _, sample := range inVec {
                // We are only looking for classic buckets here. Remember
                // the histograms for later treatment.
                if sample.H != nil {
                        histogramSamples = append(histogramSamples, sample)
                        continue
                }

                // The bucket label has to parse as a float; otherwise the
                // sample is skipped with a warning.
                upperBound, err := strconv.ParseFloat(
                        sample.Metric.Get(model.BucketLabel), 64,
                )
                if err != nil {
                        annos.Add(annotations.NewBadBucketLabelWarning(sample.Metric.Get(labels.MetricName), sample.Metric.Get(model.BucketLabel), args[1].PositionRange()))
                        continue
                }
                // The signature is the label set without the bucket label, so
                // all buckets of one histogram end up in the same map entry.
                enh.lblBuf = sample.Metric.BytesWithoutLabels(enh.lblBuf, labels.BucketLabel)
                mb, ok := enh.signatureToMetricWithBuckets[string(enh.lblBuf)]
                if !ok {
                        // First bucket seen for this signature: the output
                        // metric is the sample's label set with excludedLabels
                        // removed.
                        sample.Metric = labels.NewBuilder(sample.Metric).
                                Del(excludedLabels...).
                                Labels()

                        mb = &metricWithBuckets{sample.Metric, nil}
                        enh.signatureToMetricWithBuckets[string(enh.lblBuf)] = mb
                }
                mb.buckets = append(mb.buckets, bucket{upperBound, sample.F})
        }

        // Now deal with the histograms.
        for _, sample := range histogramSamples {
                // We have to reconstruct the exact same signature as above for
                // a classic histogram, just ignoring any le label.
                enh.lblBuf = sample.Metric.Bytes(enh.lblBuf)
                if mb, ok := enh.signatureToMetricWithBuckets[string(enh.lblBuf)]; ok && len(mb.buckets) > 0 {
                        // At this data point, we have classic histogram
                        // buckets and a native histogram with the same name and
                        // labels. Do not evaluate anything.
                        annos.Add(annotations.NewMixedClassicNativeHistogramsWarning(sample.Metric.Get(labels.MetricName), args[1].PositionRange()))
                        // Removing the entry also keeps the classic buckets out
                        // of the output loop below.
                        delete(enh.signatureToMetricWithBuckets, string(enh.lblBuf))
                        continue
                }

                enh.Out = append(enh.Out, Sample{
                        Metric: sample.Metric.DropMetricName(),
                        F:      histogramQuantile(q, sample.H),
                })
        }

        // Emit one sample per signature that collected classic buckets during
        // this call. Entries left empty by the reset above are skipped.
        for _, mb := range enh.signatureToMetricWithBuckets {
                if len(mb.buckets) > 0 {
                        // The third return value of bucketQuantile is not used here.
                        res, forcedMonotonicity, _ := bucketQuantile(q, mb.buckets)
                        enh.Out = append(enh.Out, Sample{
                                Metric: mb.metric,
                                F:      res,
                        })
                        if forcedMonotonicity {
                                annos.Add(annotations.NewHistogramQuantileForcedMonotonicityInfo(mb.metric.Get(labels.MetricName), args[1].PositionRange()))
                        }
                }
        }

        return enh.Out, annos
}

// === resets(Matrix parser.ValueTypeMatrix) (Vector, Annotations) ===
// funcResets counts the resets in a series and appends the count to enh.Out.
// For floats, a reset is a value smaller than its predecessor; for histograms,
// a reset is whatever DetectReset reports for a histogram and its predecessor.
// Floats and histograms are scanned independently and their counts added up.
func funcResets(vals []parser.Value, args parser.Expressions, enh *EvalNodeHelper) (Vector, annotations.Annotations) {
        floats := vals[0].(Matrix)[0].Floats
        histograms := vals[0].(Matrix)[0].Histograms

        var resets int
        // Starting at index 1 makes both loops no-ops for fewer than two samples.
        for i := 1; i < len(floats); i++ {
                if floats[i].F < floats[i-1].F {
                        resets++
                }
        }
        for i := 1; i < len(histograms); i++ {
                if histograms[i].H.DetectReset(histograms[i-1].H) {
                        resets++
                }
        }

        return append(enh.Out, Sample{F: float64(resets)}), nil
}

// === changes(Matrix parser.ValueTypeMatrix) (Vector, Annotations) ===
// funcChanges counts how often the float value of a series differs from its
// predecessor and appends the count to enh.Out. Two consecutive NaN values are
// not counted as a change. If the series has no floats, nothing is appended.
func funcChanges(vals []parser.Value, args parser.Expressions, enh *EvalNodeHelper) (Vector, annotations.Annotations) {
        floats := vals[0].(Matrix)[0].Floats
        if len(floats) == 0 {
                // TODO(beorn7): Only histogram values, still need to add support.
                return enh.Out, nil
        }

        var changes int
        for i := 1; i < len(floats); i++ {
                prev, cur := floats[i-1].F, floats[i].F
                if cur == prev {
                        continue
                }
                // NaN never compares equal, so a NaN following a NaN needs its
                // own check.
                if math.IsNaN(cur) && math.IsNaN(prev) {
                        continue
                }
                changes++
        }

        return append(enh.Out, Sample{F: float64(changes)}), nil
}

// evalLabelReplace implements label_replace. It operates only on series; it
// does not look at timestamps or values.
//
// The arguments are, in order: the expression to evaluate, the destination
// label name, the replacement template, the source label name and the regular
// expression. For every series whose source label value matches the (fully
// anchored) expression, the destination label is set to the expanded template.
// Series that do not match are left untouched.
//
// It panics if the regular expression does not compile or the destination
// label name is invalid, and reports an error through ev.errorf if the
// rewritten matrix contains the same label set more than once.
func (ev *evaluator) evalLabelReplace(args parser.Expressions) (parser.Value, annotations.Annotations) {
        dst := stringFromArg(args[1])
        repl := stringFromArg(args[2])
        src := stringFromArg(args[3])
        regexStr := stringFromArg(args[4])

        // Anchor the pattern so that it has to match the whole source value.
        regex, err := regexp.Compile("^(?:" + regexStr + ")$")
        if err != nil {
                panic(fmt.Errorf("invalid regular expression in label_replace(): %s", regexStr))
        }
        if !model.LabelNameRE.MatchString(dst) {
                panic(fmt.Errorf("invalid destination label name in label_replace(): %s", dst))
        }

        val, ws := ev.eval(args[0])
        matrix := val.(Matrix)
        builder := labels.NewBuilder(labels.EmptyLabels())

        for i := range matrix {
                metric := matrix[i].Metric
                srcVal := metric.Get(src)
                match := regex.FindStringSubmatchIndex(srcVal)
                if match == nil {
                        // Only replace when regexp matches.
                        continue
                }
                expanded := regex.ExpandString([]byte{}, repl, srcVal, match)
                builder.Reset(metric)
                builder.Set(dst, string(expanded))
                matrix[i].Metric = builder.Labels()
        }
        if matrix.ContainsSameLabelset() {
                ev.errorf("vector cannot contain metrics with the same labelset")
        }

        return matrix, ws
}

// === label_replace(Vector parser.ValueTypeVector, dst_label, replacement, src_labelname, regex parser.ValueTypeString) (Vector, Annotations) ===
//
// funcLabelReplace is a placeholder that panics unconditionally. The actual
// label_replace logic lives in evaluator.evalLabelReplace, so reaching this
// function means the wrong implementation was dispatched.
func funcLabelReplace(vals []parser.Value, args parser.Expressions, enh *EvalNodeHelper) (Vector, annotations.Annotations) {
        panic("funcLabelReplace wrong implementation called")
}

// === Vector(s Scalar) (Vector, Annotations) ===
//
// funcVector appends a single sample with an empty label set to enh.Out. The
// sample value is taken from the first element of the first argument.
func funcVector(vals []parser.Value, args parser.Expressions, enh *EvalNodeHelper) (Vector, annotations.Annotations) {
        scalar := vals[0].(Vector)[0].F
        sample := Sample{
                Metric: labels.Labels{},
                F:      scalar,
        }
        return append(enh.Out, sample), nil
}

// evalLabelJoin implements label_join(v, dst, separator, src_1, src_2, ...).
// It only rewrites the label sets of the evaluated series; sample timestamps
// and values are never inspected.
//
// For every series the values of the source labels are joined with the
// separator and stored in the destination label. It panics if any source label
// name or the destination label name is invalid, and reports an evaluation
// error if the rewrite makes two series share the same label set.
func (ev *evaluator) evalLabelJoin(args parser.Expressions) (parser.Value, annotations.Annotations) {
        var (
                dst       = stringFromArg(args[1])
                sep       = stringFromArg(args[2])
                srcLabels = make([]string, len(args)-3)
        )
        // Arguments from index 3 onwards are the source label names.
        for i := 3; i < len(args); i++ {
                src := stringFromArg(args[i])
                if !model.LabelName(src).IsValid() {
                        panic(fmt.Errorf("invalid source label name in label_join(): %s", src))
                }
                srcLabels[i-3] = src
        }
        if !model.LabelName(dst).IsValid() {
                panic(fmt.Errorf("invalid destination label name in label_join(): %s", dst))
        }

        val, ws := ev.eval(args[0])
        matrix := val.(Matrix)
        srcVals := make([]string, len(srcLabels))
        lb := labels.NewBuilder(labels.EmptyLabels())

        for i, el := range matrix {
                // srcVals is reused across series; every slot is overwritten here.
                for j, src := range srcLabels {
                        srcVals[j] = el.Metric.Get(src)
                }
                strval := strings.Join(srcVals, sep)
                lb.Reset(el.Metric)
                lb.Set(dst, strval)
                matrix[i].Metric = lb.Labels()
        }
        // Overwriting dst can make previously distinct series identical. Reject
        // such a result the same way evalLabelReplace does instead of returning
        // a vector with duplicate label sets.
        if matrix.ContainsSameLabelset() {
                ev.errorf("vector cannot contain metrics with the same labelset")
        }

        return matrix, ws
}

// === label_join(vector model.ValVector, dest_labelname, separator, src_labelname...) (Vector, Annotations) ===
//
// funcLabelJoin is a placeholder that panics unconditionally. The actual
// label_join logic lives in evaluator.evalLabelJoin, so reaching this function
// means the wrong implementation was dispatched.
func funcLabelJoin(vals []parser.Value, args parser.Expressions, enh *EvalNodeHelper) (Vector, annotations.Annotations) {
        // The message names this function (it used to say funcLabelReplace) so
        // that a stack-less panic report points at the right call.
        panic("funcLabelJoin wrong implementation called")
}

// dateWrapper holds the code shared by the date related functions.
//
// With no arguments it applies f to the evaluation timestamp enh.Ts (divided
// by 1000 to get Unix seconds) and returns enh.Out plus one sample with an
// empty label set. Otherwise it treats every sample value of the first
// argument as Unix seconds, applies f to it, and appends one sample per input
// element (with the metric name dropped) to enh.Out. All times are in UTC.
func dateWrapper(vals []parser.Value, enh *EvalNodeHelper, f func(time.Time) float64) Vector {
        if len(vals) > 0 {
                for _, el := range vals[0].(Vector) {
                        when := time.Unix(int64(el.F), 0).UTC()
                        enh.Out = append(enh.Out, Sample{
                                Metric: el.Metric.DropMetricName(),
                                F:      f(when),
                        })
                }
                return enh.Out
        }

        evalTime := time.Unix(enh.Ts/1000, 0).UTC()
        return append(enh.Out, Sample{
                Metric: labels.Labels{},
                F:      f(evalTime),
        })
}

// === days_in_month(v Vector) Scalar ===
//
// funcDaysInMonth yields the number of days in the month of each timestamp.
func funcDaysInMonth(vals []parser.Value, args parser.Expressions, enh *EvalNodeHelper) (Vector, annotations.Annotations) {
        daysInMonth := func(t time.Time) float64 {
                // Day 32 of a month normalizes into the following month; the
                // resulting day number is how far past the month's end it landed.
                overflow := time.Date(t.Year(), t.Month(), 32, 0, 0, 0, 0, time.UTC)
                return float64(32 - overflow.Day())
        }
        return dateWrapper(vals, enh, daysInMonth), nil
}

// === day_of_month(v Vector) Scalar ===
//
// funcDayOfMonth yields the day of the month of each timestamp.
func funcDayOfMonth(vals []parser.Value, args parser.Expressions, enh *EvalNodeHelper) (Vector, annotations.Annotations) {
        dayOfMonth := func(t time.Time) float64 { return float64(t.Day()) }
        return dateWrapper(vals, enh, dayOfMonth), nil
}

// === day_of_week(v Vector) Scalar ===
//
// funcDayOfWeek yields the weekday of each timestamp as the numeric value of
// time.Weekday.
func funcDayOfWeek(vals []parser.Value, args parser.Expressions, enh *EvalNodeHelper) (Vector, annotations.Annotations) {
        dayOfWeek := func(t time.Time) float64 { return float64(t.Weekday()) }
        return dateWrapper(vals, enh, dayOfWeek), nil
}

// === day_of_year(v Vector) Scalar ===
//
// funcDayOfYear yields the day of the year of each timestamp.
func funcDayOfYear(vals []parser.Value, args parser.Expressions, enh *EvalNodeHelper) (Vector, annotations.Annotations) {
        dayOfYear := func(t time.Time) float64 { return float64(t.YearDay()) }
        return dateWrapper(vals, enh, dayOfYear), nil
}

// === hour(v Vector) Scalar ===
//
// funcHour yields the hour of the day of each timestamp.
func funcHour(vals []parser.Value, args parser.Expressions, enh *EvalNodeHelper) (Vector, annotations.Annotations) {
        hour := func(t time.Time) float64 { return float64(t.Hour()) }
        return dateWrapper(vals, enh, hour), nil
}

// === minute(v Vector) Scalar ===
//
// funcMinute yields the minute of the hour of each timestamp.
func funcMinute(vals []parser.Value, args parser.Expressions, enh *EvalNodeHelper) (Vector, annotations.Annotations) {
        minute := func(t time.Time) float64 { return float64(t.Minute()) }
        return dateWrapper(vals, enh, minute), nil
}

// === month(v Vector) Scalar ===
//
// funcMonth yields the month of each timestamp as the numeric value of
// time.Month.
func funcMonth(vals []parser.Value, args parser.Expressions, enh *EvalNodeHelper) (Vector, annotations.Annotations) {
        month := func(t time.Time) float64 { return float64(t.Month()) }
        return dateWrapper(vals, enh, month), nil
}

// === year(v Vector) Scalar ===
//
// funcYear yields the year of each timestamp.
func funcYear(vals []parser.Value, args parser.Expressions, enh *EvalNodeHelper) (Vector, annotations.Annotations) {
        year := func(t time.Time) float64 { return float64(t.Year()) }
        return dateWrapper(vals, enh, year), nil
}

// FunctionCalls maps the name of every function supported by PromQL to the
// FunctionCall that implements it.
//
// The entries for "label_replace" and "label_join" are placeholders that panic
// when called; the evaluator has dedicated methods for those two functions.
var FunctionCalls = map[string]FunctionCall{
        "abs":                funcAbs,
        "absent":             funcAbsent,
        "absent_over_time":   funcAbsentOverTime,
        "acos":               funcAcos,
        "acosh":              funcAcosh,
        "asin":               funcAsin,
        "asinh":              funcAsinh,
        "atan":               funcAtan,
        "atanh":              funcAtanh,
        "avg_over_time":      funcAvgOverTime,
        "ceil":               funcCeil,
        "changes":            funcChanges,
        "clamp":              funcClamp,
        "clamp_max":          funcClampMax,
        "clamp_min":          funcClampMin,
        "cos":                funcCos,
        "cosh":               funcCosh,
        "count_over_time":    funcCountOverTime,
        "days_in_month":      funcDaysInMonth,
        "day_of_month":       funcDayOfMonth,
        "day_of_week":        funcDayOfWeek,
        "day_of_year":        funcDayOfYear,
        "deg":                funcDeg,
        "delta":              funcDelta,
        "deriv":              funcDeriv,
        "exp":                funcExp,
        "floor":              funcFloor,
        "histogram_avg":      funcHistogramAvg,
        "histogram_count":    funcHistogramCount,
        "histogram_fraction": funcHistogramFraction,
        "histogram_quantile": funcHistogramQuantile,
        "histogram_sum":      funcHistogramSum,
        "histogram_stddev":   funcHistogramStdDev,
        "histogram_stdvar":   funcHistogramStdVar,
        "holt_winters":       funcHoltWinters,
        "hour":               funcHour,
        "idelta":             funcIdelta,
        "increase":           funcIncrease,
        "irate":              funcIrate,
        "label_replace":      funcLabelReplace,
        "label_join":         funcLabelJoin,
        "ln":                 funcLn,
        "log10":              funcLog10,
        "log2":               funcLog2,
        "last_over_time":     funcLastOverTime,
        "mad_over_time":      funcMadOverTime,
        "max_over_time":      funcMaxOverTime,
        "min_over_time":      funcMinOverTime,
        "minute":             funcMinute,
        "month":              funcMonth,
        "pi":                 funcPi,
        "predict_linear":     funcPredictLinear,
        "present_over_time":  funcPresentOverTime,
        "quantile_over_time": funcQuantileOverTime,
        "rad":                funcRad,
        "rate":               funcRate,
        "resets":             funcResets,
        "round":              funcRound,
        "scalar":             funcScalar,
        "sgn":                funcSgn,
        "sin":                funcSin,
        "sinh":               funcSinh,
        "sort":               funcSort,
        "sort_desc":          funcSortDesc,
        "sort_by_label":      funcSortByLabel,
        "sort_by_label_desc": funcSortByLabelDesc,
        "sqrt":               funcSqrt,
        "stddev_over_time":   funcStddevOverTime,
        "stdvar_over_time":   funcStdvarOverTime,
        "sum_over_time":      funcSumOverTime,
        "tan":                funcTan,
        "tanh":               funcTanh,
        "time":               funcTime,
        "timestamp":          funcTimestamp,
        "vector":             funcVector,
        "year":               funcYear,
}

// AtModifierUnsafeFunctions is the set of functions whose result can vary
// when the evaluation time changes even though their arguments are step
// invariant. It also includes functions that use the timestamps of the passed
// instant vector argument to calculate a result, since that can also change
// with the evaluation time.
//
// Only the keys matter; the map is used as a set.
var AtModifierUnsafeFunctions = map[string]struct{}{
        // Step invariant functions.
        "days_in_month": {}, "day_of_month": {}, "day_of_week": {}, "day_of_year": {},
        "hour": {}, "minute": {}, "month": {}, "year": {},
        "predict_linear": {}, "time": {},
        // Uses timestamp of the argument for the result,
        // hence unsafe to use with @ modifier.
        "timestamp": {},
}

// vectorByValueHeap is a Vector ordered by ascending sample value. Its method
// set matches the one required by container/heap.
type vectorByValueHeap Vector

// Len returns the number of samples in the heap.
func (s vectorByValueHeap) Len() int {
        return len(s)
}

// Less reports whether sample i orders before sample j. Samples carrying a
// histogram are compared by the histogram's sum of observations, and a NaN
// value at i always orders first.
func (s vectorByValueHeap) Less(i, j int) bool {
        // TODO(beorn7): Is comparing histograms by their sum what we want?
        left, right := s[i].F, s[j].F
        if h := s[i].H; h != nil {
                left = h.Sum
        }
        if h := s[j].H; h != nil {
                right = h.Sum
        }

        return math.IsNaN(left) || left < right
}

// Swap exchanges the samples at positions i and j.
func (s vectorByValueHeap) Swap(i, j int) {
        s[j], s[i] = s[i], s[j]
}

// Push appends the sample that x, a *Sample, points to.
func (s *vectorByValueHeap) Push(x interface{}) {
        sample := x.(*Sample)
        *s = append(*s, *sample)
}

// Pop removes the last sample and returns it as a Sample value.
func (s *vectorByValueHeap) Pop() interface{} {
        last := len(*s) - 1
        popped := (*s)[last]
        *s = (*s)[:last]
        return popped
}

// vectorByReverseValueHeap is a Vector ordered by descending sample value. Its
// method set matches the one required by container/heap.
type vectorByReverseValueHeap Vector

// Len returns the number of samples in the heap.
func (s vectorByReverseValueHeap) Len() int {
        return len(s)
}

// Less reports whether sample i orders before sample j. Samples carrying a
// histogram are compared by the histogram's sum of observations, and a NaN
// value at i always orders first.
func (s vectorByReverseValueHeap) Less(i, j int) bool {
        // TODO(beorn7): Is comparing histograms by their sum what we want?
        left, right := s[i].F, s[j].F
        if h := s[i].H; h != nil {
                left = h.Sum
        }
        if h := s[j].H; h != nil {
                right = h.Sum
        }

        return math.IsNaN(left) || left > right
}

// Swap exchanges the samples at positions i and j.
func (s vectorByReverseValueHeap) Swap(i, j int) {
        s[j], s[i] = s[i], s[j]
}

// Push appends the sample that x, a *Sample, points to.
func (s *vectorByReverseValueHeap) Push(x interface{}) {
        sample := x.(*Sample)
        *s = append(*s, *sample)
}

// Pop removes the last sample and returns it as a Sample value.
func (s *vectorByReverseValueHeap) Pop() interface{} {
        last := len(*s) - 1
        popped := (*s)[last]
        *s = (*s)[:last]
        return popped
}

// createLabelsForAbsentFunction returns the labels that are uniquely and
// exactly matched in a given expression. It is used in the absent functions.
//
// Only vector and matrix selectors contribute labels; any other expression
// yields an empty label set. The metric name matcher is always ignored.
func createLabelsForAbsentFunction(expr parser.Expr) labels.Labels {
        var matchers []*labels.Matcher
        switch sel := expr.(type) {
        case *parser.VectorSelector:
                matchers = sel.LabelMatchers
        case *parser.MatrixSelector:
                matchers = sel.VectorSelector.(*parser.VectorSelector).LabelMatchers
        default:
                return labels.EmptyLabels()
        }

        builder := labels.NewBuilder(labels.EmptyLabels())

        // The 'seen' map implements backwards-compatibility for historic behaviour:
        // e.g. in `absent(x{job="a",job="b",foo="bar"})` then `job` is removed from the output.
        // Note this gives arguably wrong behaviour for `absent(x{job="a",job="a",foo="bar"})`.
        seen := make(map[string]bool, len(matchers))
        for _, m := range matchers {
                switch {
                case m.Name == labels.MetricName:
                        // The metric name never becomes part of the result.
                case m.Type == labels.MatchEqual && !seen[m.Name]:
                        builder.Set(m.Name, m.Value)
                        seen[m.Name] = true
                default:
                        // Non-equality matchers and repeated names drop the label.
                        builder.Del(m.Name)
                }
        }

        return builder.Labels()
}

// stringFromArg returns the value of the string literal held by e, after
// unwrapping a step-invariant wrapper and, if present, a parenthesised
// expression. It panics if what remains is not a *parser.StringLiteral.
func stringFromArg(e parser.Expr) string {
        expr := unwrapStepInvariantExpr(e)
        unwrapParenExpr(&expr)
        literal := expr.(*parser.StringLiteral)
        return literal.Val
}

// stringSliceFromArgs applies stringFromArg to every argument and returns the
// resulting strings in argument order.
func stringSliceFromArgs(args parser.Expressions) []string {
        out := make([]string, len(args))
        for idx, arg := range args {
                out[idx] = stringFromArg(arg)
        }
        return out
}

// Copyright 2015 The Prometheus Authors
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// Only build when go-fuzz is in use
//go:build gofuzz

package promql

import (
        "errors"
        "io"

        "github.com/prometheus/prometheus/model/labels"
        "github.com/prometheus/prometheus/model/textparse"
        "github.com/prometheus/prometheus/promql/parser"
)

// PromQL parser fuzzing instrumentation for use with
// https://github.com/dvyukov/go-fuzz.
//
// Fuzz each parser by building an appropriately instrumented binary for it,
// e.g. for FuzzParseMetric, by running
//
//     go-fuzz-build -func FuzzParseMetric -o FuzzParseMetric.zip github.com/prometheus/prometheus/promql
//
// And then run the tests with the appropriate inputs
//
//     go-fuzz -bin FuzzParseMetric.zip -workdir fuzz-data/ParseMetric
//
// Further input samples should go in the folders fuzz-data/ParseMetric/corpus.
//
// Repeat for FuzzParseOpenMetric, FuzzParseMetricSelector and FuzzParseExpr.

// Return values of the Fuzz* functions.
//
// Tuning which value is returned from Fuzz*-functions has a strong influence
// on how quickly the fuzzer converges on "interesting" cases. At least try
// switching between fuzzMeh (= included in corpus, but not a priority) and
// fuzzDiscard (= don't use this input for re-building later inputs) when
// experimenting.
const (
        fuzzInteresting = 1
        fuzzMeh         = 0
        fuzzDiscard     = -1

        // Input size above which we know that Prometheus would consume too much
        // memory. The recommended way to deal with it is to check the input size.
        // https://google.github.io/oss-fuzz/getting-started/new-project-guide/#input-size
        maxInputSize = 10240
)

// Use a package-scope symbol table to avoid a memory allocation on every
// fuzzing operation. It is passed to textparse.New by
// fuzzParseMetricWithContentType.
var symbolTable = labels.NewSymbolTable()

// fuzzParseMetricWithContentType runs the text parser selected by contentType
// over the whole input. It returns fuzzInteresting when the parser reaches
// io.EOF without any other error and fuzzMeh otherwise. It panics if
// textparse.New rejects the content type.
func fuzzParseMetricWithContentType(in []byte, contentType string) int {
        p, warning := textparse.New(in, contentType, false, symbolTable)
        if warning != nil {
                // An invalid content type is being passed, which should not happen
                // in this context.
                panic(warning)
        }

        for {
                _, err := p.Next()
                if err == nil {
                        continue
                }
                // Running out of input is the only successful way to stop.
                if errors.Is(err, io.EOF) {
                        return fuzzInteresting
                }
                return fuzzMeh
        }
}

// FuzzParseMetric fuzzes the metric parser. It passes an empty content type
// to fuzzParseMetricWithContentType and returns its result.
//
// Note that this is not the parser for the text-based exposition-format; that
// lives in github.com/prometheus/client_golang/text.
func FuzzParseMetric(in []byte) int {
        return fuzzParseMetricWithContentType(in, "")
}

// FuzzParseOpenMetric fuzzes the metric parser with the content type
// "application/openmetrics-text" and returns the result of
// fuzzParseMetricWithContentType.
func FuzzParseOpenMetric(in []byte) int {
        return fuzzParseMetricWithContentType(in, "application/openmetrics-text")
}

// FuzzParseMetricSelector fuzzes the metric selector parser. Inputs longer
// than maxInputSize are not parsed at all. It returns fuzzInteresting when the
// input parses without error and fuzzMeh in every other case.
func FuzzParseMetricSelector(in []byte) int {
        if len(in) > maxInputSize {
                return fuzzMeh
        }
        if _, err := parser.ParseMetricSelector(string(in)); err != nil {
                return fuzzMeh
        }
        return fuzzInteresting
}

// FuzzParseExpr fuzzes the expression parser. Inputs longer than maxInputSize
// are not parsed at all. It returns fuzzInteresting when the input parses
// without error and fuzzMeh in every other case.
func FuzzParseExpr(in []byte) int {
        if len(in) > maxInputSize {
                return fuzzMeh
        }
        if _, err := parser.ParseExpr(string(in)); err != nil {
                return fuzzMeh
        }
        return fuzzInteresting
}

// Copyright 2024 The Prometheus Authors
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package promql

import (
        "github.com/prometheus/prometheus/model/histogram"
        "github.com/prometheus/prometheus/model/value"
        "github.com/prometheus/prometheus/tsdb/chunkenc"
)

// histogramStatsIterator wraps a chunkenc.Iterator and overrides AtHistogram
// and AtFloatHistogram; all other methods come from the embedded iterator.
type histogramStatsIterator struct {
        chunkenc.Iterator

        // currentH is the histogram most recently read from the wrapped
        // iterator; lastH is a copy of the one read before it and is the
        // reference for counter reset detection.
        currentH *histogram.Histogram
        lastH    *histogram.Histogram

        // currentFH and lastFH play the same roles for float histograms.
        currentFH *histogram.FloatHistogram
        lastFH    *histogram.FloatHistogram
}

// NewHistogramStatsIterator creates an iterator which returns histogram objects
// which have only their sum and count values populated. The iterator handles
// counter reset detection internally and sets the counter reset hint accordingly
// in each returned histogram object.
func NewHistogramStatsIterator(it chunkenc.Iterator) chunkenc.Iterator {
        return &histogramStatsIterator{
                Iterator:  it,
                currentH:  &histogram.Histogram{},
                currentFH: &histogram.FloatHistogram{},
        }
}

// AtHistogram returns the timestamp/histogram pair at the current position of
// the wrapped iterator, with only the counter reset hint, count and sum
// populated. If h is non-nil those three fields are written into it and it is
// returned; otherwise a new histogram is allocated. A stale sample is returned
// as a fresh histogram carrying only the stale sum.
//
// The counter reset detection is guaranteed to be correct only when the caller
// does not switch between AtHistogram and AtFloatHistogram calls.
func (f *histogramStatsIterator) AtHistogram(h *histogram.Histogram) (int64, *histogram.Histogram) {
        ts, cur := f.Iterator.AtHistogram(f.currentH)
        f.currentH = cur
        if value.IsStaleNaN(cur.Sum) {
                f.setLastH(cur)
                return ts, &histogram.Histogram{Sum: cur.Sum}
        }

        if h == nil {
                h = &histogram.Histogram{}
        }
        // The hint has to be computed before lastH is overwritten below.
        h.CounterResetHint = f.getResetHint(cur)
        h.Count = cur.Count
        h.Sum = cur.Sum
        f.setLastH(cur)
        return ts, h
}

// AtFloatHistogram returns the timestamp/float histogram pair at the current
// position of the wrapped iterator, with only the counter reset hint, count
// and sum populated. If fh is non-nil those three fields are written into it
// and it is returned; otherwise a new float histogram is allocated. A stale
// sample is returned as a fresh float histogram carrying only the stale sum.
//
// The counter reset detection is guaranteed to be correct only when the caller
// does not switch between AtHistogram and AtFloatHistogram calls.
func (f *histogramStatsIterator) AtFloatHistogram(fh *histogram.FloatHistogram) (int64, *histogram.FloatHistogram) {
        ts, cur := f.Iterator.AtFloatHistogram(f.currentFH)
        f.currentFH = cur
        if value.IsStaleNaN(cur.Sum) {
                f.setLastFH(cur)
                return ts, &histogram.FloatHistogram{Sum: cur.Sum}
        }

        if fh == nil {
                fh = &histogram.FloatHistogram{}
        }
        // The hint has to be computed before lastFH is overwritten below.
        fh.CounterResetHint = f.getFloatResetHint(cur.CounterResetHint)
        fh.Count = cur.Count
        fh.Sum = cur.Sum
        f.setLastFH(cur)
        return ts, fh
}

// setLastH records a copy of h as the previous histogram, reusing the
// existing lastH object when there is one.
func (f *histogramStatsIterator) setLastH(h *histogram.Histogram) {
        if f.lastH != nil {
                h.CopyTo(f.lastH)
                return
        }
        f.lastH = h.Copy()
}

// setLastFH records a copy of fh as the previous float histogram, reusing the
// existing lastFH object when there is one.
func (f *histogramStatsIterator) setLastFH(fh *histogram.FloatHistogram) {
        if f.lastFH != nil {
                fh.CopyTo(f.lastFH)
                return
        }
        f.lastFH = fh.Copy()
}

// getFloatResetHint resolves the counter reset hint for the current float
// histogram. A hint other than UnknownCounterReset is returned unchanged.
// Otherwise the result is CounterReset when currentFH detects a reset against
// lastFH, and NotCounterReset when it does not or when there is no previous
// float histogram.
func (f *histogramStatsIterator) getFloatResetHint(hint histogram.CounterResetHint) histogram.CounterResetHint {
        switch {
        case hint != histogram.UnknownCounterReset:
                return hint
        case f.lastFH == nil:
                return histogram.NotCounterReset
        case f.currentFH.DetectReset(f.lastFH):
                return histogram.CounterReset
        default:
                return histogram.NotCounterReset
        }
}

// getResetHint resolves the counter reset hint for h. A hint other than
// UnknownCounterReset is returned unchanged. Otherwise h and lastH are both
// converted to float histograms and the result is CounterReset when a reset is
// detected between them, and NotCounterReset when it is not or when there is
// no previous histogram.
func (f *histogramStatsIterator) getResetHint(h *histogram.Histogram) histogram.CounterResetHint {
        if hint := h.CounterResetHint; hint != histogram.UnknownCounterReset {
                return hint
        }
        if f.lastH == nil {
                return histogram.NotCounterReset
        }

        current := h.ToFloat(nil)
        previous := f.lastH.ToFloat(nil)
        if !current.DetectReset(previous) {
                return histogram.NotCounterReset
        }
        return histogram.CounterReset
}

// Copyright 2015 The Prometheus Authors
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package parser

import (
        "context"
        "fmt"
        "time"

        "github.com/prometheus/prometheus/model/labels"
        "github.com/prometheus/prometheus/storage"

        "github.com/prometheus/prometheus/promql/parser/posrange"
)

// Node is a generic interface for all nodes in an AST.
//
// Whenever numerous nodes are listed, such as in a switch-case statement
// or a chain of function definitions (e.g. String(), PromQLExpr(), etc.), the
// convention is to list them as follows:
//
//   - Statements
//   - statement types (alphabetical)
//   - ...
//   - Expressions
//   - expression types (alphabetical)
//   - ...
type Node interface {
        // String representation of the node that returns the given node when parsed
        // as part of a valid query.
        String() string

        // Pretty returns the prettified representation of the node.
        // It uses the level information to determine at which level/depth the current
        // node is in the AST and uses this to apply indentation.
        Pretty(level int) string

        // PositionRange returns the position of the AST Node in the query string.
        PositionRange() posrange.PositionRange
}

// Statement is a generic interface for all statements.
type Statement interface {
        Node

        // PromQLStmt ensures that no other type accidentally implements the interface.
        // It is a marker method only.
        PromQLStmt()
}

// EvalStmt holds an expression and information on the range it should
// be evaluated on.
type EvalStmt struct {
        Expr Expr // Expression to be evaluated.

        // The time boundaries for the evaluation. If Start equals End an instant
        // is evaluated.
        Start, End time.Time
        // Time between two evaluated instants for the range [Start:End].
        Interval time.Duration
        // Lookback delta to use for this evaluation.
        LookbackDelta time.Duration
}

// PromQLStmt is the empty marker method required by the Statement interface.
func (*EvalStmt) PromQLStmt() {}

// Expr is a generic interface for all expression types.
type Expr interface {
        Node

        // Type returns the type the expression evaluates to. It does not perform
        // in-depth checks as this is done at parsing-time.
        Type() ValueType
        // PromQLExpr ensures that no other types accidentally implement the interface.
        // It is a marker method only.
        PromQLExpr()
}

// Expressions is a list of expression nodes that implements Node.
// Call uses it to hold the arguments of a function call.
type Expressions []Expr

// AggregateExpr represents an aggregation operation on a Vector.
type AggregateExpr struct {
        Op       ItemType // The used aggregation operation.
        Expr     Expr     // The Vector expression over which is aggregated.
        Param    Expr     // Parameter used by some aggregators.
        Grouping []string // The labels by which to group the Vector.
        Without  bool     // Whether to drop the given labels rather than keep them.
        // NOTE(review): presumably the span of the expression in the query
        // string, as reported by PositionRange — confirm.
        PosRange posrange.PositionRange
}

// BinaryExpr represents a binary expression between two child expressions.
type BinaryExpr struct {
        Op       ItemType // The operation of the expression.
        LHS, RHS Expr     // The operands on the respective sides of the operator.

        // The matching behavior for the operation if both operands are Vectors.
        // If they are not, this field is nil.
        VectorMatching *VectorMatching

        // If a comparison operator, return 0/1 rather than filtering.
        ReturnBool bool
}

// Call represents a function call.
type Call struct {
        Func *Function   // The function that was called.
        Args Expressions // Arguments used in the call.

        // NOTE(review): presumably the span of the call in the query string,
        // as reported by PositionRange — confirm.
        PosRange posrange.PositionRange
}

// MatrixSelector represents a Matrix selection.
type MatrixSelector struct {
        // It is safe to assume that this is a VectorSelector
        // if the parser hasn't returned an error.
        VectorSelector Expr
        Range          time.Duration

        // NOTE(review): presumably the end position of the selector in the
        // query string — confirm.
        EndPos posrange.Pos
}

// SubqueryExpr represents a subquery.
type SubqueryExpr struct {
        Expr  Expr
        Range time.Duration
        // OriginalOffset is the actual offset that was set in the query.
        // This never changes.
        OriginalOffset time.Duration
        // Offset is the offset used during the query execution
        // which is calculated using the original offset, at modifier time,
        // eval time, and subquery offsets in the AST tree.
        Offset     time.Duration
        Timestamp  *int64
        StartOrEnd ItemType // Set when @ is used with start() or end()
        Step       time.Duration

        // NOTE(review): presumably the end position of the subquery in the
        // query string — confirm.
        EndPos posrange.Pos
}

// NumberLiteral represents a number. Its Type is ValueTypeScalar.
type NumberLiteral struct {
        Val float64

        PosRange posrange.PositionRange
}

// ParenExpr wraps an expression so it cannot be disassembled as a consequence
// of operator precedence. Its Type is the type of the wrapped expression.
type ParenExpr struct {
        Expr     Expr
        PosRange posrange.PositionRange
}

// StringLiteral represents a string. Its Type is ValueTypeString.
type StringLiteral struct {
        Val      string
        PosRange posrange.PositionRange
}

// UnaryExpr represents a unary operation on another expression.
// Currently unary operations are only supported for Scalars.
type UnaryExpr struct {
        Op   ItemType
        Expr Expr

        // NOTE(review): presumably the position of the operator in the query
        // string — confirm.
        StartPos posrange.Pos
}

// StepInvariantExpr represents a query which evaluates to the same result
// irrespective of the evaluation time given the raw samples from TSDB remain unchanged.
// Currently this is only used for engine optimisations and the parser does not produce this.
type StepInvariantExpr struct {
        Expr Expr
}

// String returns the string representation of the wrapped expression.
func (e *StepInvariantExpr) String() string { return e.Expr.String() }

// PositionRange returns the position range of the wrapped expression.
func (e *StepInvariantExpr) PositionRange() posrange.PositionRange {
        return e.Expr.PositionRange()
}

// VectorSelector represents a Vector selection.
type VectorSelector struct {
        Name string
        // OriginalOffset is the actual offset that was set in the query.
        // This never changes.
        OriginalOffset time.Duration
        // Offset is the offset used during the query execution
        // which is calculated using the original offset, at modifier time,
        // eval time, and subquery offsets in the AST tree.
        Offset               time.Duration
        Timestamp            *int64
        SkipHistogramBuckets bool     // Set when decoding native histogram buckets is not needed for query evaluation.
        StartOrEnd           ItemType // Set when @ is used with start() or end()
        LabelMatchers        []*labels.Matcher

        // The unexpanded seriesSet populated at query preparation time.
        UnexpandedSeriesSet storage.SeriesSet
        // NOTE(review): presumably the series obtained by expanding
        // UnexpandedSeriesSet — confirm against the engine.
        Series              []storage.Series

        PosRange posrange.PositionRange
}

// TestStmt is an internal helper statement that allows execution
// of an arbitrary function during handling. It is used to test the Engine.
type TestStmt func(context.Context) error

// String, PromQLStmt and Pretty implement the Statement interface. Pretty
// ignores the level and returns the same fixed text as String.
func (TestStmt) String() string      { return "test statement" }
func (TestStmt) PromQLStmt()         {}
func (t TestStmt) Pretty(int) string { return t.String() }

// PositionRange returns a range with Start and End both set to -1, as a test
// statement has no position in a query string.
func (TestStmt) PositionRange() posrange.PositionRange {
        return posrange.PositionRange{
                Start: -1,
                End:   -1,
        }
}
// Type implementations of the Expr interface. Wrapping expressions (ParenExpr,
// UnaryExpr) report the type of the expression they wrap; a Call reports the
// return type of its function.
func (e *AggregateExpr) Type() ValueType  { return ValueTypeVector }
func (e *Call) Type() ValueType           { return e.Func.ReturnType }
func (e *MatrixSelector) Type() ValueType { return ValueTypeMatrix }
func (e *SubqueryExpr) Type() ValueType   { return ValueTypeMatrix }
func (e *NumberLiteral) Type() ValueType  { return ValueTypeScalar }
func (e *ParenExpr) Type() ValueType      { return e.Expr.Type() }
func (e *StringLiteral) Type() ValueType  { return ValueTypeString }
func (e *UnaryExpr) Type() ValueType      { return e.Expr.Type() }
func (e *VectorSelector) Type() ValueType { return ValueTypeVector }
// Type returns ValueTypeScalar only when both operands are scalars; every
// other combination of operand types yields ValueTypeVector.
func (e *BinaryExpr) Type() ValueType {
        bothScalar := e.LHS.Type() == ValueTypeScalar && e.RHS.Type() == ValueTypeScalar
        if !bothScalar {
                return ValueTypeVector
        }
        return ValueTypeScalar
}
// Type returns the type of the wrapped expression.
func (e *StepInvariantExpr) Type() ValueType { return e.Expr.Type() }

// PromQLExpr is the empty marker method of the Expr interface; the
// declarations below provide it for each expression node type.
func (*AggregateExpr) PromQLExpr()     {}
func (*BinaryExpr) PromQLExpr()        {}
func (*Call) PromQLExpr()              {}
func (*MatrixSelector) PromQLExpr()    {}
func (*SubqueryExpr) PromQLExpr()      {}
func (*NumberLiteral) PromQLExpr()     {}
func (*ParenExpr) PromQLExpr()         {}
func (*StringLiteral) PromQLExpr()     {}
func (*UnaryExpr) PromQLExpr()         {}
func (*VectorSelector) PromQLExpr()    {}
func (*StepInvariantExpr) PromQLExpr() {}

// VectorMatchCardinality describes the cardinality relationship
// of two Vectors in a binary operation.
type VectorMatchCardinality int

// The supported cardinality relationships.
const (
        CardOneToOne VectorMatchCardinality = iota
        CardManyToOne
        CardOneToMany
        CardManyToMany
)

// String returns the hyphenated lower-case name of the cardinality, for
// example "many-to-one". It panics for values outside the declared constants.
func (vmc VectorMatchCardinality) String() string {
        names := [...]string{
                CardOneToOne:   "one-to-one",
                CardManyToOne:  "many-to-one",
                CardOneToMany:  "one-to-many",
                CardManyToMany: "many-to-many",
        }
        if vmc < 0 || int(vmc) >= len(names) {
                panic("promql.VectorMatchCardinality.String: unknown match cardinality")
        }
        return names[vmc]
}

// VectorMatching describes how elements from two Vectors in a binary
// operation are supposed to be matched.
type VectorMatching struct {
        // Card is the cardinality relationship between the two Vectors.
        Card VectorMatchCardinality
        // MatchingLabels contains the labels which define equality of a pair of
        // elements from the Vectors.
        MatchingLabels []string
        // On selects how MatchingLabels is interpreted: when true the listed
        // label names are the ones used for matching, when false they are
        // the ones excluded from matching.
        On bool
        // Include contains additional labels that should be included in
        // the result from the side with the lower cardinality.
        Include []string
}

// Visitor allows visiting a Node and its child nodes. The Visit method is
// invoked for each node, and additionally receives the path leading to that
// node. If the returned visitor w is not nil and the returned error is nil,
// Walk visits each of the children of node with the visitor w, followed by a
// call of w.Visit(nil, nil).
type Visitor interface {
        Visit(node Node, path []Node) (w Visitor, err error)
}

// Walk traverses an AST in depth-first order. It first calls
// v.Visit(node, path); node must not be nil. When that call returns an error,
// or a nil visitor, Walk stops and returns the error (which may be nil).
// Otherwise Walk calls itself with the returned visitor w for each child
// reported by Children(node), stopping at the first error, and finally calls
// w.Visit(nil, nil) and returns its error.
// While descending, path is extended with the nodes visited so far.
func Walk(v Visitor, node Node, path []Node) error {
        next, err := v.Visit(node, path)
        if err != nil || next == nil {
                return err
        }

        // Children see the current node as the last element of their path.
        path = append(path, node)
        for _, child := range Children(node) {
                if childErr := Walk(next, child, path); childErr != nil {
                        return childErr
                }
        }

        // Signal to the visitor that this subtree is finished.
        _, err = next.Visit(nil, nil)
        return err
}

// ExtractSelectors collects the label matchers of every VectorSelector that
// Inspect encounters in expr, in visiting order. The result is nil when no
// VectorSelector is encountered.
func ExtractSelectors(expr Expr) [][]*labels.Matcher {
        var found [][]*labels.Matcher
        collect := func(node Node, _ []Node) error {
                if sel, isSelector := node.(*VectorSelector); isSelector {
                        found = append(found, sel.LabelMatchers)
                }
                return nil
        }
        Inspect(expr, collect)
        return found
}

// inspector adapts a plain callback to the Visitor interface.
type inspector func(Node, []Node) error

// Visit calls f with the node and its path. On success it returns f itself so
// that the traversal continues with the same callback; on failure it returns
// a nil visitor together with the callback's error.
func (f inspector) Visit(node Node, path []Node) (Visitor, error) {
        err := f(node, path)
        if err == nil {
                return f, nil
        }
        return nil, err
}

// Inspect traverses an AST in depth-first order by handing f to Walk with an
// empty path: f(node, path) is called first (node must not be nil), and as
// long as f returns a nil error it is called recursively for the children of
// node. Any error that ends the traversal is discarded.
func Inspect(node Node, f inspector) {
        // The error is dropped on purpose; Inspect has no way to report it.
        _ = Walk(f, node, nil)
}

// Children returns the direct child nodes of a syntax tree node.
//
// Number literals, string literals and vector selectors yield an empty,
// non-nil slice. An AggregateExpr with neither Expr nor Param set yields nil.
// Any node type not handled below causes a panic.
func Children(node Node) []Node {
        // A type switch is used on purpose: it has been observed to perform
        // significantly better than dispatching through an interface method.
        switch typed := node.(type) {
        case *NumberLiteral, *StringLiteral, *VectorSelector:
                // Leaf nodes have no children.
                return []Node{}
        case *EvalStmt:
                return []Node{typed.Expr}
        case *SubqueryExpr:
                return []Node{typed.Expr}
        case *ParenExpr:
                return []Node{typed.Expr}
        case *UnaryExpr:
                return []Node{typed.Expr}
        case *StepInvariantExpr:
                return []Node{typed.Expr}
        case *MatrixSelector:
                return []Node{typed.VectorSelector}
        case *BinaryExpr:
                return []Node{typed.LHS, typed.RHS}
        case Expressions:
                // Go cannot convert between slices of different interface
                // types, so the elements are copied one by one.
                children := make([]Node, 0, len(typed))
                for _, expr := range typed {
                        children = append(children, expr)
                }
                return children
        case *Call:
                // Same element-wise copy as for Expressions.
                children := make([]Node, 0, len(typed.Args))
                for _, arg := range typed.Args {
                        children = append(children, arg)
                }
                return children
        case *AggregateExpr:
                // Each combination returns an exactly sized literal so that no
                // slice ever has to grow.
                hasExpr, hasParam := typed.Expr != nil, typed.Param != nil
                switch {
                case hasExpr && hasParam:
                        return []Node{typed.Expr, typed.Param}
                case hasExpr:
                        return []Node{typed.Expr}
                case hasParam:
                        return []Node{typed.Param}
                default:
                        return nil
                }
        default:
                panic(fmt.Errorf("promql.Children: unhandled node type %T", node))
        }
}

// mergeRanges combines the PositionRanges of two Nodes into one range that
// starts where first starts and ends where last ends. The arguments must be
// given in the order in which they occur in the input string.
func mergeRanges(first, last Node) posrange.PositionRange {
        var merged posrange.PositionRange
        merged.Start = first.PositionRange().Start
        merged.End = last.PositionRange().End
        return merged
}

// PositionRange returns the range that begins at the item's Pos and covers
// len(Val) positions. Item implements the Node interface through this method,
// which makes it possible to pass Items to mergeRanges.
func (i *Item) PositionRange() posrange.PositionRange {
        end := i.Pos + posrange.Pos(len(i.Val))
        return posrange.PositionRange{Start: i.Pos, End: end}
}

// PositionRange returns the range stored in the expression's PosRange field.
func (e *AggregateExpr) PositionRange() posrange.PositionRange {
        return e.PosRange
}

// PositionRange returns the range from the start of the left-hand operand to
// the end of the right-hand operand.
func (e *BinaryExpr) PositionRange() posrange.PositionRange {
        return mergeRanges(e.LHS, e.RHS)
}

// PositionRange returns the range stored in the call's PosRange field.
func (e *Call) PositionRange() posrange.PositionRange {
        return e.PosRange
}

// PositionRange returns the range of the statement's expression.
func (e *EvalStmt) PositionRange() posrange.PositionRange {
        return e.Expr.PositionRange()
}

// PositionRange returns the range from the start of the first expression to
// the end of the last one. For an empty list the position is undefined and
// both Start and End are -1.
func (e Expressions) PositionRange() posrange.PositionRange {
        if count := len(e); count > 0 {
                return mergeRanges(e[0], e[count-1])
        }
        // Position undefined.
        return posrange.PositionRange{Start: -1, End: -1}
}

// PositionRange returns the range that starts where the inner vector selector
// starts and ends at the selector's EndPos.
func (e *MatrixSelector) PositionRange() posrange.PositionRange {
        start := e.VectorSelector.PositionRange().Start
        return posrange.PositionRange{Start: start, End: e.EndPos}
}

// PositionRange returns the range that starts where the inner expression
// starts and ends at the subquery's EndPos.
func (e *SubqueryExpr) PositionRange() posrange.PositionRange {
        start := e.Expr.PositionRange().Start
        return posrange.PositionRange{Start: start, End: e.EndPos}
}

// PositionRange returns the range stored in the literal's PosRange field.
func (e *NumberLiteral) PositionRange() posrange.PositionRange {
        return e.PosRange
}

// PositionRange returns the range stored in the expression's PosRange field.
func (e *ParenExpr) PositionRange() posrange.PositionRange {
        return e.PosRange
}

// PositionRange returns the range stored in the literal's PosRange field.
func (e *StringLiteral) PositionRange() posrange.PositionRange {
        return e.PosRange
}

// PositionRange returns the range that starts at the expression's StartPos
// and ends where the operand expression ends.
func (e *UnaryExpr) PositionRange() posrange.PositionRange {
        end := e.Expr.PositionRange().End
        return posrange.PositionRange{Start: e.StartPos, End: end}
}

// PositionRange returns the range stored in the selector's PosRange field.
func (e *VectorSelector) PositionRange() posrange.PositionRange {
        return e.PosRange
}

// Copyright 2015 The Prometheus Authors
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package parser

// Function represents a function of the expression language and is
// used by function nodes.
type Function struct {
        // Name is the function's name; in the Functions table it equals the
        // map key of the entry.
        Name         string
        // ArgTypes lists the value types of the function's arguments, in order.
        ArgTypes     []ValueType
        // Variadic is left at zero by most entries of the Functions table;
        // the entries that set it use 1 or -1.
        // NOTE(review): presumably a positive value bounds the number of
        // optional trailing arguments and a negative value means "unbounded" —
        // confirm against the argument checker.
        Variadic     int
        // ReturnType is the value type the function evaluates to.
        ReturnType   ValueType
        // Experimental marks the function as experimental.
        // NOTE(review): presumably gated by EnableExperimentalFunctions —
        // confirm where the flag is read.
        Experimental bool
}

// EnableExperimentalFunctions controls whether experimental functions are
// enabled. Its zero value is false.
var EnableExperimentalFunctions bool

// Functions is the table of all functions supported by PromQL, keyed by
// function name. Each entry records the function's argument types, its
// variadic setting, its return type and whether it is experimental; the Name
// field of every entry repeats the map key.
var Functions = map[string]*Function{
        "abs": {
                Name:       "abs",
                ArgTypes:   []ValueType{ValueTypeVector},
                ReturnType: ValueTypeVector,
        },
        "absent": {
                Name:       "absent",
                ArgTypes:   []ValueType{ValueTypeVector},
                ReturnType: ValueTypeVector,
        },
        "absent_over_time": {
                Name:       "absent_over_time",
                ArgTypes:   []ValueType{ValueTypeMatrix},
                ReturnType: ValueTypeVector,
        },
        "acos": {
                Name:       "acos",
                ArgTypes:   []ValueType{ValueTypeVector},
                ReturnType: ValueTypeVector,
        },
        "acosh": {
                Name:       "acosh",
                ArgTypes:   []ValueType{ValueTypeVector},
                ReturnType: ValueTypeVector,
        },
        "asin": {
                Name:       "asin",
                ArgTypes:   []ValueType{ValueTypeVector},
                ReturnType: ValueTypeVector,
        },
        "asinh": {
                Name:       "asinh",
                ArgTypes:   []ValueType{ValueTypeVector},
                ReturnType: ValueTypeVector,
        },
        "atan": {
                Name:       "atan",
                ArgTypes:   []ValueType{ValueTypeVector},
                ReturnType: ValueTypeVector,
        },
        "atanh": {
                Name:       "atanh",
                ArgTypes:   []ValueType{ValueTypeVector},
                ReturnType: ValueTypeVector,
        },
        "avg_over_time": {
                Name:       "avg_over_time",
                ArgTypes:   []ValueType{ValueTypeMatrix},
                ReturnType: ValueTypeVector,
        },
        "ceil": {
                Name:       "ceil",
                ArgTypes:   []ValueType{ValueTypeVector},
                ReturnType: ValueTypeVector,
        },
        "changes": {
                Name:       "changes",
                ArgTypes:   []ValueType{ValueTypeMatrix},
                ReturnType: ValueTypeVector,
        },
        "clamp": {
                Name:       "clamp",
                ArgTypes:   []ValueType{ValueTypeVector, ValueTypeScalar, ValueTypeScalar},
                ReturnType: ValueTypeVector,
        },
        "clamp_max": {
                Name:       "clamp_max",
                ArgTypes:   []ValueType{ValueTypeVector, ValueTypeScalar},
                ReturnType: ValueTypeVector,
        },
        "clamp_min": {
                Name:       "clamp_min",
                ArgTypes:   []ValueType{ValueTypeVector, ValueTypeScalar},
                ReturnType: ValueTypeVector,
        },
        "cos": {
                Name:       "cos",
                ArgTypes:   []ValueType{ValueTypeVector},
                ReturnType: ValueTypeVector,
        },
        "cosh": {
                Name:       "cosh",
                ArgTypes:   []ValueType{ValueTypeVector},
                ReturnType: ValueTypeVector,
        },
        "count_over_time": {
                Name:       "count_over_time",
                ArgTypes:   []ValueType{ValueTypeMatrix},
                ReturnType: ValueTypeVector,
        },
        "days_in_month": {
                Name:       "days_in_month",
                ArgTypes:   []ValueType{ValueTypeVector},
                Variadic:   1,
                ReturnType: ValueTypeVector,
        },
        "day_of_month": {
                Name:       "day_of_month",
                ArgTypes:   []ValueType{ValueTypeVector},
                Variadic:   1,
                ReturnType: ValueTypeVector,
        },
        "day_of_week": {
                Name:       "day_of_week",
                ArgTypes:   []ValueType{ValueTypeVector},
                Variadic:   1,
                ReturnType: ValueTypeVector,
        },
        "day_of_year": {
                Name:       "day_of_year",
                ArgTypes:   []ValueType{ValueTypeVector},
                Variadic:   1,
                ReturnType: ValueTypeVector,
        },
        "deg": {
                Name:       "deg",
                ArgTypes:   []ValueType{ValueTypeVector},
                ReturnType: ValueTypeVector,
        },
        "delta": {
                Name:       "delta",
                ArgTypes:   []ValueType{ValueTypeMatrix},
                ReturnType: ValueTypeVector,
        },
        "deriv": {
                Name:       "deriv",
                ArgTypes:   []ValueType{ValueTypeMatrix},
                ReturnType: ValueTypeVector,
        },
        "exp": {
                Name:       "exp",
                ArgTypes:   []ValueType{ValueTypeVector},
                ReturnType: ValueTypeVector,
        },
        "floor": {
                Name:       "floor",
                ArgTypes:   []ValueType{ValueTypeVector},
                ReturnType: ValueTypeVector,
        },
        "histogram_avg": {
                Name:       "histogram_avg",
                ArgTypes:   []ValueType{ValueTypeVector},
                ReturnType: ValueTypeVector,
        },
        "histogram_count": {
                Name:       "histogram_count",
                ArgTypes:   []ValueType{ValueTypeVector},
                ReturnType: ValueTypeVector,
        },
        "histogram_sum": {
                Name:       "histogram_sum",
                ArgTypes:   []ValueType{ValueTypeVector},
                ReturnType: ValueTypeVector,
        },
        "histogram_stddev": {
                Name:       "histogram_stddev",
                ArgTypes:   []ValueType{ValueTypeVector},
                ReturnType: ValueTypeVector,
        },
        "histogram_stdvar": {
                Name:       "histogram_stdvar",
                ArgTypes:   []ValueType{ValueTypeVector},
                ReturnType: ValueTypeVector,
        },
        "histogram_fraction": {
                Name:       "histogram_fraction",
                ArgTypes:   []ValueType{ValueTypeScalar, ValueTypeScalar, ValueTypeVector},
                ReturnType: ValueTypeVector,
        },
        "histogram_quantile": {
                Name:       "histogram_quantile",
                ArgTypes:   []ValueType{ValueTypeScalar, ValueTypeVector},
                ReturnType: ValueTypeVector,
        },
        "holt_winters": {
                Name:       "holt_winters",
                ArgTypes:   []ValueType{ValueTypeMatrix, ValueTypeScalar, ValueTypeScalar},
                ReturnType: ValueTypeVector,
        },
        "hour": {
                Name:       "hour",
                ArgTypes:   []ValueType{ValueTypeVector},
                Variadic:   1,
                ReturnType: ValueTypeVector,
        },
        "idelta": {
                Name:       "idelta",
                ArgTypes:   []ValueType{ValueTypeMatrix},
                ReturnType: ValueTypeVector,
        },
        "increase": {
                Name:       "increase",
                ArgTypes:   []ValueType{ValueTypeMatrix},
                ReturnType: ValueTypeVector,
        },
        "irate": {
                Name:       "irate",
                ArgTypes:   []ValueType{ValueTypeMatrix},
                ReturnType: ValueTypeVector,
        },
        "label_replace": {
                Name:       "label_replace",
                ArgTypes:   []ValueType{ValueTypeVector, ValueTypeString, ValueTypeString, ValueTypeString, ValueTypeString},
                ReturnType: ValueTypeVector,
        },
        "label_join": {
                Name:       "label_join",
                ArgTypes:   []ValueType{ValueTypeVector, ValueTypeString, ValueTypeString, ValueTypeString},
                Variadic:   -1,
                ReturnType: ValueTypeVector,
        },
        "last_over_time": {
                Name:       "last_over_time",
                ArgTypes:   []ValueType{ValueTypeMatrix},
                ReturnType: ValueTypeVector,
        },
        "ln": {
                Name:       "ln",
                ArgTypes:   []ValueType{ValueTypeVector},
                ReturnType: ValueTypeVector,
        },
        "log10": {
                Name:       "log10",
                ArgTypes:   []ValueType{ValueTypeVector},
                ReturnType: ValueTypeVector,
        },
        "log2": {
                Name:       "log2",
                ArgTypes:   []ValueType{ValueTypeVector},
                ReturnType: ValueTypeVector,
        },
        "mad_over_time": {
                Name:         "mad_over_time",
                ArgTypes:     []ValueType{ValueTypeMatrix},
                ReturnType:   ValueTypeVector,
                Experimental: true,
        },
        "max_over_time": {
                Name:       "max_over_time",
                ArgTypes:   []ValueType{ValueTypeMatrix},
                ReturnType: ValueTypeVector,
        },
        "min_over_time": {
                Name:       "min_over_time",
                ArgTypes:   []ValueType{ValueTypeMatrix},
                ReturnType: ValueTypeVector,
        },
        "minute": {
                Name:       "minute",
                ArgTypes:   []ValueType{ValueTypeVector},
                Variadic:   1,
                ReturnType: ValueTypeVector,
        },
        "month": {
                Name:       "month",
                ArgTypes:   []ValueType{ValueTypeVector},
                Variadic:   1,
                ReturnType: ValueTypeVector,
        },
        "pi": {
                Name:       "pi",
                ArgTypes:   []ValueType{},
                ReturnType: ValueTypeScalar,
        },
        "predict_linear": {
                Name:       "predict_linear",
                ArgTypes:   []ValueType{ValueTypeMatrix, ValueTypeScalar},
                ReturnType: ValueTypeVector,
        },
        "present_over_time": {
                Name:       "present_over_time",
                ArgTypes:   []ValueType{ValueTypeMatrix},
                ReturnType: ValueTypeVector,
        },
        "quantile_over_time": {
                Name:       "quantile_over_time",
                ArgTypes:   []ValueType{ValueTypeScalar, ValueTypeMatrix},
                ReturnType: ValueTypeVector,
        },
        "rad": {
                Name:       "rad",
                ArgTypes:   []ValueType{ValueTypeVector},
                ReturnType: ValueTypeVector,
        },
        "rate": {
                Name:       "rate",
                ArgTypes:   []ValueType{ValueTypeMatrix},
                ReturnType: ValueTypeVector,
        },
        "resets": {
                Name:       "resets",
                ArgTypes:   []ValueType{ValueTypeMatrix},
                ReturnType: ValueTypeVector,
        },
        "round": {
                Name:       "round",
                ArgTypes:   []ValueType{ValueTypeVector, ValueTypeScalar},
                Variadic:   1,
                ReturnType: ValueTypeVector,
        },
        "scalar": {
                Name:       "scalar",
                ArgTypes:   []ValueType{ValueTypeVector},
                ReturnType: ValueTypeScalar,
        },
        "sgn": {
                Name:       "sgn",
                ArgTypes:   []ValueType{ValueTypeVector},
                ReturnType: ValueTypeVector,
        },
        "sin": {
                Name:       "sin",
                ArgTypes:   []ValueType{ValueTypeVector},
                ReturnType: ValueTypeVector,
        },
        "sinh": {
                Name:       "sinh",
                ArgTypes:   []ValueType{ValueTypeVector},
                ReturnType: ValueTypeVector,
        },
        "sort": {
                Name:       "sort",
                ArgTypes:   []ValueType{ValueTypeVector},
                ReturnType: ValueTypeVector,
        },
        "sort_desc": {
                Name:       "sort_desc",
                ArgTypes:   []ValueType{ValueTypeVector},
                ReturnType: ValueTypeVector,
        },
        "sort_by_label": {
                Name:         "sort_by_label",
                ArgTypes:     []ValueType{ValueTypeVector, ValueTypeString},
                Variadic:     -1,
                ReturnType:   ValueTypeVector,
                Experimental: true,
        },
        "sort_by_label_desc": {
                Name:         "sort_by_label_desc",
                ArgTypes:     []ValueType{ValueTypeVector, ValueTypeString},
                Variadic:     -1,
                ReturnType:   ValueTypeVector,
                Experimental: true,
        },
        "sqrt": {
                Name:       "sqrt",
                ArgTypes:   []ValueType{ValueTypeVector},
                ReturnType: ValueTypeVector,
        },
        "stddev_over_time": {
                Name:       "stddev_over_time",
                ArgTypes:   []ValueType{ValueTypeMatrix},
                ReturnType: ValueTypeVector,
        },
        "stdvar_over_time": {
                Name:       "stdvar_over_time",
                ArgTypes:   []ValueType{ValueTypeMatrix},
                ReturnType: ValueTypeVector,
        },
        "sum_over_time": {
                Name:       "sum_over_time",
                ArgTypes:   []ValueType{ValueTypeMatrix},
                ReturnType: ValueTypeVector,
        },
        "tan": {
                Name:       "tan",
                ArgTypes:   []ValueType{ValueTypeVector},
                ReturnType: ValueTypeVector,
        },
        "tanh": {
                Name:       "tanh",
                ArgTypes:   []ValueType{ValueTypeVector},
                ReturnType: ValueTypeVector,
        },
        "time": {
                Name:       "time",
                ArgTypes:   []ValueType{},
                ReturnType: ValueTypeScalar,
        },
        "timestamp": {
                Name:       "timestamp",
                ArgTypes:   []ValueType{ValueTypeVector},
                ReturnType: ValueTypeVector,
        },
        "vector": {
                Name:       "vector",
                ArgTypes:   []ValueType{ValueTypeScalar},
                ReturnType: ValueTypeVector,
        },
        "year": {
                Name:       "year",
                ArgTypes:   []ValueType{ValueTypeVector},
                Variadic:   1,
                ReturnType: ValueTypeVector,
        },
}

// getFunction looks up name in the given function table. It returns the
// stored Function and true when the table has an entry for name, and nil and
// false otherwise.
func getFunction(name string, functions map[string]*Function) (*Function, bool) {
        if fn, found := functions[name]; found {
                return fn, true
        }
        return nil, false
}

// Code generated by goyacc -l -o promql/parser/generated_parser.y.go promql/parser/generated_parser.y. DO NOT EDIT.
package parser

import __yyfmt__ "fmt"

import (
        "math"
        "strconv"
        "time"

        "github.com/prometheus/prometheus/model/histogram"
        "github.com/prometheus/prometheus/model/labels"
        "github.com/prometheus/prometheus/model/value"
        "github.com/prometheus/prometheus/promql/parser/posrange"
)

// yySymType is part of the goyacc-generated parser; it bundles one field per
// kind of value the grammar works with (nodes, lexer items, label matchers,
// label sets, series values, histograms, numbers and durations).
// NOTE(review): presumably this is the parser's semantic value type, with yys
// holding the parser state — confirm against the goyacc documentation.
type yySymType struct {
        yys         int
        node        Node
        item        Item
        matchers    []*labels.Matcher
        matcher     *labels.Matcher
        label       labels.Label
        labels      labels.Labels
        lblList     []labels.Label
        strings     []string
        series      []SequenceValue
        histogram   *histogram.FloatHistogram
        descriptors map[string]interface{}
        bucket_set  []float64
        int         int64
        uint        uint64
        float       float64
        duration    time.Duration
}

// Token codes emitted by goyacc. The values are consecutive, starting at
// 57346, and are declared in the same order as the names that follow the
// three leading pseudo-entries of yyToknames.
// NOTE(review): the lower-case xxxStart / xxxEnd constants presumably act as
// sentinels that delimit groups of related tokens — confirm against the
// grammar file.
const EQL = 57346
const BLANK = 57347
const COLON = 57348
const COMMA = 57349
const COMMENT = 57350
const DURATION = 57351
const EOF = 57352
const ERROR = 57353
const IDENTIFIER = 57354
const LEFT_BRACE = 57355
const LEFT_BRACKET = 57356
const LEFT_PAREN = 57357
const OPEN_HIST = 57358
const CLOSE_HIST = 57359
const METRIC_IDENTIFIER = 57360
const NUMBER = 57361
const RIGHT_BRACE = 57362
const RIGHT_BRACKET = 57363
const RIGHT_PAREN = 57364
const SEMICOLON = 57365
const SPACE = 57366
const STRING = 57367
const TIMES = 57368
const histogramDescStart = 57369
const SUM_DESC = 57370
const COUNT_DESC = 57371
const SCHEMA_DESC = 57372
const OFFSET_DESC = 57373
const NEGATIVE_OFFSET_DESC = 57374
const BUCKETS_DESC = 57375
const NEGATIVE_BUCKETS_DESC = 57376
const ZERO_BUCKET_DESC = 57377
const ZERO_BUCKET_WIDTH_DESC = 57378
const CUSTOM_VALUES_DESC = 57379
const histogramDescEnd = 57380
const operatorsStart = 57381
const ADD = 57382
const DIV = 57383
const EQLC = 57384
const EQL_REGEX = 57385
const GTE = 57386
const GTR = 57387
const LAND = 57388
const LOR = 57389
const LSS = 57390
const LTE = 57391
const LUNLESS = 57392
const MOD = 57393
const MUL = 57394
const NEQ = 57395
const NEQ_REGEX = 57396
const POW = 57397
const SUB = 57398
const AT = 57399
const ATAN2 = 57400
const operatorsEnd = 57401
const aggregatorsStart = 57402
const AVG = 57403
const BOTTOMK = 57404
const COUNT = 57405
const COUNT_VALUES = 57406
const GROUP = 57407
const MAX = 57408
const MIN = 57409
const QUANTILE = 57410
const STDDEV = 57411
const STDVAR = 57412
const SUM = 57413
const TOPK = 57414
const aggregatorsEnd = 57415
const keywordsStart = 57416
const BOOL = 57417
const BY = 57418
const GROUP_LEFT = 57419
const GROUP_RIGHT = 57420
const IGNORING = 57421
const OFFSET = 57422
const ON = 57423
const WITHOUT = 57424
const keywordsEnd = 57425
const preprocessorStart = 57426
const START = 57427
const END = 57428
const preprocessorEnd = 57429
const startSymbolsStart = 57430
const START_METRIC = 57431
const START_SERIES_DESCRIPTION = 57432
const START_EXPRESSION = 57433
const START_METRIC_SELECTOR = 57434
const startSymbolsEnd = 57435

// yyToknames holds the printable names of the parser's tokens. It starts with
// the three pseudo-entries "$end", "error" and "$unk"; the remaining entries
// spell the token constants in declaration order, beginning with EQL.
var yyToknames = [...]string{
        "$end",
        "error",
        "$unk",
        "EQL",
        "BLANK",
        "COLON",
        "COMMA",
        "COMMENT",
        "DURATION",
        "EOF",
        "ERROR",
        "IDENTIFIER",
        "LEFT_BRACE",
        "LEFT_BRACKET",
        "LEFT_PAREN",
        "OPEN_HIST",
        "CLOSE_HIST",
        "METRIC_IDENTIFIER",
        "NUMBER",
        "RIGHT_BRACE",
        "RIGHT_BRACKET",
        "RIGHT_PAREN",
        "SEMICOLON",
        "SPACE",
        "STRING",
        "TIMES",
        "histogramDescStart",
        "SUM_DESC",
        "COUNT_DESC",
        "SCHEMA_DESC",
        "OFFSET_DESC",
        "NEGATIVE_OFFSET_DESC",
        "BUCKETS_DESC",
        "NEGATIVE_BUCKETS_DESC",
        "ZERO_BUCKET_DESC",
        "ZERO_BUCKET_WIDTH_DESC",
        "CUSTOM_VALUES_DESC",
        "histogramDescEnd",
        "operatorsStart",
        "ADD",
        "DIV",
        "EQLC",
        "EQL_REGEX",
        "GTE",
        "GTR",
        "LAND",
        "LOR",
        "LSS",
        "LTE",
        "LUNLESS",
        "MOD",
        "MUL",
        "NEQ",
        "NEQ_REGEX",
        "POW",
        "SUB",
        "AT",
        "ATAN2",
        "operatorsEnd",
        "aggregatorsStart",
        "AVG",
        "BOTTOMK",
        "COUNT",
        "COUNT_VALUES",
        "GROUP",
        "MAX",
        "MIN",
        "QUANTILE",
        "STDDEV",
        "STDVAR",
        "SUM",
        "TOPK",
        "aggregatorsEnd",
        "keywordsStart",
        "BOOL",
        "BY",
        "GROUP_LEFT",
        "GROUP_RIGHT",
        "IGNORING",
        "OFFSET",
        "ON",
        "WITHOUT",
        "keywordsEnd",
        "preprocessorStart",
        "START",
        "END",
        "preprocessorEnd",
        "startSymbolsStart",
        "START_METRIC",
        "START_SERIES_DESCRIPTION",
        "START_EXPRESSION",
        "START_METRIC_SELECTOR",
        "startSymbolsEnd",
}

// yyStatenames is emitted by goyacc; it is empty in this generated file.
var yyStatenames = [...]string{}

// Fixed parser constants emitted by goyacc. yyEofCode (1) and yyErrCode (2)
// are distinct small codes.
// NOTE(review): yyInitialStackSize is presumably the initial capacity of the
// parser's stack — confirm in the generated parser body.
const yyEofCode = 1
const yyErrCode = 2
const yyInitialStackSize = 16

// yyExca is a goyacc-generated parser table; do not edit it by hand.
// NOTE(review): the data looks like a sequence of records, each opened by a
// (-1, state) pair, followed by (token, action) pairs and closed by a
// (-2, default) pair — confirm against the goyacc documentation.
var yyExca = [...]int16{
        -1, 1,
        1, -1,
        -2, 0,
        -1, 35,
        1, 134,
        10, 134,
        24, 134,
        -2, 0,
        -1, 58,
        2, 172,
        15, 172,
        76, 172,
        82, 172,
        -2, 100,
        -1, 59,
        2, 173,
        15, 173,
        76, 173,
        82, 173,
        -2, 101,
        -1, 60,
        2, 174,
        15, 174,
        76, 174,
        82, 174,
        -2, 103,
        -1, 61,
        2, 175,
        15, 175,
        76, 175,
        82, 175,
        -2, 104,
        -1, 62,
        2, 176,
        15, 176,
        76, 176,
        82, 176,
        -2, 105,
        -1, 63,
        2, 177,
        15, 177,
        76, 177,
        82, 177,
        -2, 110,
        -1, 64,
        2, 178,
        15, 178,
        76, 178,
        82, 178,
        -2, 112,
        -1, 65,
        2, 179,
        15, 179,
        76, 179,
        82, 179,
        -2, 114,
        -1, 66,
        2, 180,
        15, 180,
        76, 180,
        82, 180,
        -2, 115,
        -1, 67,
        2, 181,
        15, 181,
        76, 181,
        82, 181,
        -2, 116,
        -1, 68,
        2, 182,
        15, 182,
        76, 182,
        82, 182,
        -2, 117,
        -1, 69,
        2, 183,
        15, 183,
        76, 183,
        82, 183,
        -2, 118,
        -1, 195,
        12, 231,
        13, 231,
        18, 231,
        19, 231,
        25, 231,
        40, 231,
        46, 231,
        47, 231,
        50, 231,
        56, 231,
        61, 231,
        62, 231,
        63, 231,
        64, 231,
        65, 231,
        66, 231,
        67, 231,
        68, 231,
        69, 231,
        70, 231,
        71, 231,
        72, 231,
        76, 231,
        80, 231,
        82, 231,
        85, 231,
        86, 231,
        -2, 0,
        -1, 196,
        12, 231,
        13, 231,
        18, 231,
        19, 231,
        25, 231,
        40, 231,
        46, 231,
        47, 231,
        50, 231,
        56, 231,
        61, 231,
        62, 231,
        63, 231,
        64, 231,
        65, 231,
        66, 231,
        67, 231,
        68, 231,
        69, 231,
        70, 231,
        71, 231,
        72, 231,
        76, 231,
        80, 231,
        82, 231,
        85, 231,
        86, 231,
        -2, 0,
        -1, 217,
        21, 229,
        -2, 0,
        -1, 286,
        21, 230,
        -2, 0,
}

// yyPrivate is emitted by goyacc; the first token constant (EQL, 57346) lies
// two above it.
const yyPrivate = 57344

// yyLast is emitted by goyacc.
// NOTE(review): presumably the number of entries in the yyAct table — confirm
// against the full table.
const yyLast = 778

// yyAct is the packed action/goto table. The parser reads it at
// yyPact[state]+token to obtain a candidate shift target (validated against
// yyChk), and at yyPgo[nonterminal]+state+1 (or yyPgo[nonterminal] as the
// fallback) to obtain the state to enter after a reduction.
var yyAct = [...]int16{
        151, 324, 322, 268, 329, 148, 221, 37, 187, 144,
        282, 281, 152, 113, 77, 173, 104, 102, 101, 6,
        223, 193, 105, 194, 195, 196, 128, 262, 260, 155,
        233, 103, 342, 293, 100, 319, 239, 116, 146, 318,
        315, 263, 156, 123, 106, 147, 284, 114, 295, 116,
        156, 341, 175, 259, 340, 253, 57, 264, 157, 114,
        117, 108, 313, 109, 235, 236, 157, 112, 237, 107,
        323, 174, 117, 175, 155, 96, 250, 99, 293, 224,
        226, 228, 229, 230, 238, 240, 243, 244, 245, 246,
        247, 177, 145, 225, 227, 231, 232, 234, 241, 242,
        98, 176, 178, 248, 249, 104, 2, 3, 4, 5,
        158, 105, 177, 110, 168, 162, 165, 302, 150, 160,
        191, 161, 176, 178, 189, 155, 213, 343, 106, 330,
        72, 179, 192, 33, 181, 155, 190, 197, 198, 199,
        200, 201, 202, 203, 204, 205, 206, 207, 208, 209,
        210, 211, 185, 301, 258, 212, 156, 214, 215, 188,
        256, 183, 290, 191, 252, 164, 155, 289, 300, 218,
        223, 79, 157, 217, 7, 299, 312, 257, 163, 251,
        233, 78, 288, 255, 182, 254, 239, 156, 216, 180,
        220, 124, 172, 120, 147, 311, 314, 171, 119, 261,
        287, 153, 154, 157, 279, 280, 79, 147, 283, 310,
        170, 118, 159, 10, 235, 236, 78, 309, 237, 147,
        308, 307, 306, 74, 76, 305, 250, 286, 304, 224,
        226, 228, 229, 230, 238, 240, 243, 244, 245, 246,
        247, 303, 81, 225, 227, 231, 232, 234, 241, 242,
        48, 34, 1, 248, 249, 122, 73, 121, 285, 47,
        291, 292, 294, 56, 296, 8, 9, 9, 46, 35,
        45, 44, 297, 298, 127, 129, 130, 131, 132, 133,
        134, 135, 136, 137, 138, 139, 140, 141, 142, 143,
        43, 42, 41, 125, 166, 40, 316, 317, 126, 39,
        38, 49, 186, 321, 338, 265, 326, 327, 328, 80,
        325, 184, 219, 332, 331, 334, 333, 75, 115, 149,
        335, 336, 100, 51, 72, 337, 53, 55, 222, 22,
        52, 339, 50, 167, 111, 0, 54, 0, 0, 0,
        0, 344, 0, 0, 0, 0, 0, 0, 82, 84,
        0, 70, 0, 0, 0, 0, 0, 18, 19, 93,
        94, 20, 0, 96, 97, 99, 83, 71, 0, 0,
        0, 0, 58, 59, 60, 61, 62, 63, 64, 65,
        66, 67, 68, 69, 0, 0, 0, 13, 98, 0,
        0, 24, 0, 30, 0, 0, 31, 32, 36, 100,
        51, 72, 0, 53, 267, 0, 22, 52, 0, 0,
        0, 266, 0, 54, 0, 270, 271, 269, 276, 278,
        275, 277, 272, 273, 274, 0, 84, 0, 70, 0,
        0, 0, 0, 0, 18, 19, 93, 94, 20, 0,
        96, 0, 99, 83, 71, 0, 0, 0, 0, 58,
        59, 60, 61, 62, 63, 64, 65, 66, 67, 68,
        69, 0, 0, 0, 13, 98, 0, 0, 24, 0,
        30, 0, 0, 31, 32, 51, 72, 0, 53, 320,
        0, 22, 52, 0, 0, 0, 0, 0, 54, 0,
        270, 271, 269, 276, 278, 275, 277, 272, 273, 274,
        0, 0, 0, 70, 0, 0, 17, 72, 0, 18,
        19, 0, 22, 20, 0, 0, 0, 0, 0, 71,
        0, 0, 0, 0, 58, 59, 60, 61, 62, 63,
        64, 65, 66, 67, 68, 69, 0, 0, 0, 13,
        18, 19, 0, 24, 20, 30, 0, 0, 31, 32,
        0, 0, 0, 0, 0, 11, 12, 14, 15, 16,
        21, 23, 25, 26, 27, 28, 29, 17, 33, 0,
        13, 0, 0, 22, 24, 0, 30, 0, 0, 31,
        32, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 18, 19, 0, 0, 20, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 11, 12, 14, 15,
        16, 21, 23, 25, 26, 27, 28, 29, 100, 0,
        0, 13, 0, 0, 0, 24, 169, 30, 0, 0,
        31, 32, 0, 0, 0, 0, 0, 100, 0, 0,
        0, 0, 0, 0, 82, 84, 85, 0, 86, 87,
        88, 89, 90, 91, 92, 93, 94, 95, 0, 96,
        97, 99, 83, 82, 84, 85, 0, 86, 87, 88,
        89, 90, 91, 92, 93, 94, 95, 0, 96, 97,
        99, 83, 100, 0, 98, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 100, 0, 98, 0, 0, 0, 0, 82, 84,
        85, 0, 86, 87, 88, 0, 90, 91, 92, 93,
        94, 95, 0, 96, 97, 99, 83, 82, 84, 85,
        0, 86, 87, 0, 0, 90, 91, 0, 93, 94,
        95, 0, 96, 97, 99, 83, 0, 0, 98, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 98,
}

// yyPact holds, per parser state, the base offset that is added to the
// current token number to index yyAct. A value at or below yyFlag (-1000)
// makes the parser skip the shift lookup and go straight to the state's
// default action.
var yyPact = [...]int16{
        17, 164, 555, 555, 388, 494, -1000, -1000, -1000, 120,
        -1000, -1000, -1000, -1000, -1000, -1000, -1000, -1000, -1000, -1000,
        -1000, -1000, -1000, -1000, -1000, -1000, -1000, -1000, -1000, -1000,
        -1000, -1000, -1000, 204, -1000, 240, -1000, 633, -1000, -1000,
        -1000, -1000, -1000, -1000, -1000, -1000, -1000, -1000, -1000, -1000,
        29, 113, -1000, 463, -1000, 463, 117, -1000, -1000, -1000,
        -1000, -1000, -1000, -1000, -1000, -1000, -1000, -1000, -1000, -1000,
        -1000, -1000, 47, -1000, -1000, 191, -1000, -1000, 253, -1000,
        19, -1000, -49, -49, -49, -49, -49, -49, -49, -49,
        -49, -49, -49, -49, -49, -49, -49, -49, 36, 116,
        210, 113, -60, -1000, 163, 163, 311, -1000, 614, 20,
        -1000, 190, -1000, -1000, 69, 48, -1000, -1000, -1000, 169,
        -1000, 159, -1000, 147, 463, -1000, -58, -53, -1000, 463,
        463, 463, 463, 463, 463, 463, 463, 463, 463, 463,
        463, 463, 463, 463, -1000, 185, -1000, -1000, -1000, 111,
        -1000, -1000, -1000, -1000, -1000, -1000, 55, 55, 167, -1000,
        -1000, -1000, -1000, 168, -1000, -1000, 157, -1000, 633, -1000,
        -1000, 35, -1000, 158, -1000, -1000, -1000, -1000, -1000, 152,
        -1000, -1000, -1000, -1000, -1000, 27, 2, 1, -1000, -1000,
        -1000, 387, 385, 163, 163, 163, 163, 20, 20, 308,
        308, 308, 697, 678, 308, 308, 697, 20, 20, 308,
        20, 385, -1000, 24, -1000, -1000, -1000, 198, -1000, 160,
        -1000, -1000, -1000, -1000, -1000, -1000, -1000, -1000, -1000, -1000,
        -1000, -1000, -1000, -1000, -1000, -1000, -1000, -1000, -1000, -1000,
        -1000, -1000, -1000, -1000, -1000, -1000, -1000, -1000, -1000, -1000,
        -1000, -1000, 463, -1000, -1000, -1000, -1000, -1000, -1000, 59,
        59, 22, 59, 104, 104, 151, 100, -1000, -1000, 235,
        222, 219, 216, 215, 214, 211, 203, 189, 170, -1000,
        -1000, -1000, -1000, -1000, -1000, 41, 194, -1000, -1000, 18,
        -1000, 633, -1000, -1000, -1000, 59, -1000, 13, 9, 462,
        -1000, -1000, -1000, 14, 10, 55, 55, 55, 115, 115,
        14, 115, 14, -1000, -1000, -1000, -1000, -1000, 59, 59,
        -1000, -1000, -1000, 59, -1000, -1000, -1000, -1000, -1000, -1000,
        55, -1000, -1000, -1000, -1000, -1000, -1000, -1000, 30, -1000,
        106, -1000, -1000, -1000, -1000,
}

// yyPgo holds, per nonterminal number, the base index into yyAct that the
// parser uses to look up the state to enter after reducing to that
// nonterminal.
var yyPgo = [...]int16{
        0, 334, 13, 332, 6, 15, 328, 263, 327, 319,
        318, 213, 265, 317, 14, 312, 10, 11, 311, 309,
        8, 305, 3, 4, 304, 2, 1, 0, 302, 12,
        5, 301, 300, 18, 191, 299, 298, 7, 295, 294,
        17, 293, 56, 292, 291, 290, 274, 271, 270, 268,
        259, 250, 9, 258, 252, 251,
}

// yyR1 maps a production number to the number of the nonterminal it reduces
// to; the parser uses that number to index yyPgo after a reduction.
var yyR1 = [...]int8{
        0, 54, 54, 54, 54, 54, 54, 54, 37, 37,
        37, 37, 37, 37, 37, 37, 37, 37, 37, 37,
        32, 32, 32, 32, 33, 33, 35, 35, 35, 35,
        35, 35, 35, 35, 35, 35, 35, 35, 35, 35,
        35, 35, 34, 36, 36, 46, 46, 41, 41, 41,
        41, 16, 16, 16, 16, 15, 15, 15, 4, 4,
        38, 40, 40, 39, 39, 39, 47, 45, 45, 45,
        31, 31, 31, 9, 9, 43, 49, 49, 49, 49,
        49, 50, 51, 51, 51, 42, 42, 42, 1, 1,
        1, 2, 2, 2, 2, 2, 2, 2, 12, 12,
        7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
        7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
        7, 7, 11, 11, 11, 11, 13, 13, 13, 14,
        14, 14, 14, 55, 19, 19, 19, 19, 18, 18,
        18, 18, 18, 18, 18, 18, 18, 28, 28, 28,
        20, 20, 20, 20, 21, 21, 21, 22, 22, 22,
        22, 22, 22, 22, 22, 22, 22, 23, 23, 24,
        24, 24, 3, 3, 3, 3, 3, 3, 3, 3,
        3, 3, 3, 3, 6, 6, 6, 6, 6, 6,
        6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
        6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
        6, 8, 8, 5, 5, 5, 5, 44, 27, 29,
        29, 30, 30, 26, 25, 25, 52, 48, 10, 53,
        53, 17, 17,
}

// yyR2 maps a production number to the number of stack entries the parser
// pops when it reduces by that production.
var yyR2 = [...]int8{
        0, 2, 2, 2, 2, 2, 2, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        3, 3, 2, 2, 2, 2, 4, 4, 4, 4,
        4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
        4, 4, 1, 0, 1, 3, 3, 1, 1, 3,
        3, 3, 4, 2, 1, 3, 1, 2, 1, 1,
        2, 3, 2, 3, 1, 2, 3, 3, 4, 3,
        3, 5, 3, 1, 1, 4, 6, 6, 5, 4,
        3, 2, 2, 1, 1, 3, 4, 2, 3, 1,
        2, 3, 3, 1, 3, 3, 2, 1, 2, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 3, 4, 2, 0, 3, 1, 2, 3,
        3, 2, 1, 2, 0, 3, 2, 1, 1, 3,
        1, 3, 4, 1, 3, 5, 5, 1, 1, 1,
        4, 3, 3, 2, 3, 1, 2, 3, 3, 3,
        3, 3, 3, 3, 3, 3, 3, 4, 3, 3,
        1, 2, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 2,
        2, 1, 1, 1, 2, 1, 1, 1, 1, 0,
        1, 0, 1,
}

// yyChk is the check table indexed by state. The parser accepts a candidate
// state read from yyAct only if yyChk[state] equals the current token (for a
// shift) or the negated nonterminal number (for a goto).
var yyChk = [...]int16{
        -1000, -54, 89, 90, 91, 92, 2, 10, -12, -7,
        -11, 61, 62, 76, 63, 64, 65, 12, 46, 47,
        50, 66, 18, 67, 80, 68, 69, 70, 71, 72,
        82, 85, 86, 13, -55, -12, 10, -37, -32, -35,
        -38, -43, -44, -45, -47, -48, -49, -50, -51, -31,
        -3, 12, 19, 15, 25, -8, -7, -42, 61, 62,
        63, 64, 65, 66, 67, 68, 69, 70, 71, 72,
        40, 56, 13, -51, -11, -13, 20, -14, 12, 2,
        -19, 2, 40, 58, 41, 42, 44, 45, 46, 47,
        48, 49, 50, 51, 52, 53, 55, 56, 80, 57,
        14, -33, -40, 2, 76, 82, 15, -40, -37, -37,
        -42, -1, 20, -2, 12, -10, 2, 25, 20, 7,
        2, 4, 2, 24, -34, -41, -36, -46, 75, -34,
        -34, -34, -34, -34, -34, -34, -34, -34, -34, -34,
        -34, -34, -34, -34, -52, 56, 2, 9, -30, -9,
        2, -27, -29, 85, 86, 19, 40, 56, -52, 2,
        -40, -33, -16, 15, 2, -16, -39, 22, -37, 22,
        20, 7, 2, -5, 2, 4, 53, 43, 54, -5,
        20, -14, 25, 2, -18, 5, -28, -20, 12, -27,
        -29, 16, -37, 79, 81, 77, 78, -37, -37, -37,
        -37, -37, -37, -37, -37, -37, -37, -37, -37, -37,
        -37, -37, -52, 15, -27, -27, 21, 6, 2, -15,
        22, -4, -6, 2, 61, 75, 62, 76, 63, 64,
        65, 77, 78, 12, 79, 46, 47, 50, 66, 18,
        67, 80, 81, 68, 69, 70, 71, 72, 85, 86,
        58, 22, 7, 20, -2, 25, 2, 25, 2, 26,
        26, -29, 26, 40, 56, -21, 24, 17, -22, 30,
        28, 29, 35, 36, 37, 33, 31, 34, 32, -16,
        -16, -17, -16, -17, 22, -53, -52, 2, 22, 7,
        2, -37, -26, 19, -26, 26, -26, -20, -20, 24,
        17, 2, 17, 6, 6, 6, 6, 6, 6, 6,
        6, 6, 6, 21, 2, 22, -4, -26, 26, 26,
        17, -22, -25, 56, -26, -30, -27, -27, -27, -23,
        14, -23, -25, -23, -25, -26, -26, -26, -24, -27,
        24, 21, 2, 21, -27,
}

// yyDef holds the default action for each state: -2 means the action must be
// looked up in the yyExca exception table, 0 means a syntax error, and any
// other value is the number of the production to reduce by.
var yyDef = [...]int16{
        0, -2, 125, 125, 0, 0, 7, 6, 1, 125,
        99, 100, 101, 102, 103, 104, 105, 106, 107, 108,
        109, 110, 111, 112, 113, 114, 115, 116, 117, 118,
        119, 120, 121, 0, 2, -2, 3, 4, 8, 9,
        10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
        0, 106, 217, 0, 227, 0, 83, 84, -2, -2,
        -2, -2, -2, -2, -2, -2, -2, -2, -2, -2,
        211, 212, 0, 5, 98, 0, 124, 127, 0, 132,
        133, 137, 43, 43, 43, 43, 43, 43, 43, 43,
        43, 43, 43, 43, 43, 43, 43, 43, 0, 0,
        0, 0, 22, 23, 0, 0, 0, 60, 0, 81,
        82, 0, 87, 89, 0, 93, 97, 228, 122, 0,
        128, 0, 131, 136, 0, 42, 47, 48, 44, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 67, 0, 69, 226, 70, 0,
        72, 221, 222, 73, 74, 218, 0, 0, 0, 80,
        20, 21, 24, 0, 54, 25, 0, 62, 64, 66,
        85, 0, 90, 0, 96, 213, 214, 215, 216, 0,
        123, 126, 129, 130, 135, 138, 140, 143, 147, 148,
        149, 0, 26, 0, 0, -2, -2, 27, 28, 29,
        30, 31, 32, 33, 34, 35, 36, 37, 38, 39,
        40, 41, 68, 0, 219, 220, 75, -2, 79, 0,
        53, 56, 58, 59, 184, 185, 186, 187, 188, 189,
        190, 191, 192, 193, 194, 195, 196, 197, 198, 199,
        200, 201, 202, 203, 204, 205, 206, 207, 208, 209,
        210, 61, 65, 86, 88, 91, 95, 92, 94, 0,
        0, 0, 0, 0, 0, 0, 0, 153, 155, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 45,
        46, 49, 232, 50, 71, 0, -2, 78, 51, 0,
        57, 63, 139, 223, 141, 0, 144, 0, 0, 0,
        151, 156, 152, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 76, 77, 52, 55, 142, 0, 0,
        150, 154, 157, 0, 225, 158, 159, 160, 161, 162,
        0, 163, 164, 165, 166, 145, 146, 224, 0, 170,
        0, 168, 171, 167, 169,
}

// yyTok1 translates small lexer character codes (below len(yyTok1)) into
// internal token numbers; entry 0 is also used by yylex1 for any code <= 0.
var yyTok1 = [...]int8{
        1,
}

// yyTok2 translates lexer character codes starting at yyPrivate into internal
// token numbers (index = code - yyPrivate). yylex1 also uses entry 1 as the
// token for a character it could not translate.
var yyTok2 = [...]int8{
        2, 3, 4, 5, 6, 7, 8, 9, 10, 11,
        12, 13, 14, 15, 16, 17, 18, 19, 20, 21,
        22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
        32, 33, 34, 35, 36, 37, 38, 39, 40, 41,
        42, 43, 44, 45, 46, 47, 48, 49, 50, 51,
        52, 53, 54, 55, 56, 57, 58, 59, 60, 61,
        62, 63, 64, 65, 66, 67, 68, 69, 70, 71,
        72, 73, 74, 75, 76, 77, 78, 79, 80, 81,
        82, 83, 84, 85, 86, 87, 88, 89, 90, 91,
        92, 93,
}

// yyTok3 is scanned by yylex1 in steps of two as (character code, token
// number) pairs for codes not covered by yyTok1 or yyTok2.
var yyTok3 = [...]int8{
        0,
}

// yyErrorMessages lists custom syntax-error messages keyed by parser state
// and lookahead token; yyErrorMessage consults it before building a generic
// message. The table is empty here.
var yyErrorMessages = [...]struct {
        state int
        token int
        msg   string
}{}

/*        parser for yacc output        */

var (
        // yyDebug gates the parser's trace output; the driver prints
        // progressively more detail as it is raised through thresholds 1 to 4.
        yyDebug        = 0
        // yyErrorVerbose makes yyErrorMessage report the unexpected token and
        // the expected tokens instead of the bare "syntax error".
        yyErrorVerbose = false
)

// yyLexer is the token source and error sink the parser drives.
type yyLexer interface {
        // Lex is called with a pointer to the parser's lval and returns the
        // character code of the next token; a result <= 0 is translated via
        // yyTok1[0].
        Lex(lval *yySymType) int
        // Error is called by the parser with the syntax error message.
        Error(s string)
}

// yyParser is the interface of the generated parser as returned by
// yyNewParser.
type yyParser interface {
        // Parse runs the parser over the tokens supplied by the lexer and
        // returns its integer result code.
        Parse(yyLexer) int
        // Lookahead returns the parser's current lookahead character code.
        Lookahead() int
}

// yyParserImpl is the concrete yyParser produced by yyNewParser.
type yyParserImpl struct {
        // lval is the value slot handed to the lexer on every Lex call.
        lval  yySymType
        // stack is the initial backing storage for the parse stack; Parse
        // switches to a larger slice when it overflows.
        stack [yyInitialStackSize]yySymType
        // char is the current lookahead character code; Parse sets it to -1
        // when no lookahead is pending.
        char  int
}

// Lookahead reports the lookahead character code currently held by the
// parser (Parse resets it to -1 whenever no lookahead is pending).
func (yyrcvr *yyParserImpl) Lookahead() int {
        current := yyrcvr.char
        return current
}

// yyNewParser allocates a fresh, zero-valued parser and returns it behind the
// yyParser interface.
func yyNewParser() yyParser {
        impl := new(yyParserImpl)
        return impl
}

// yyFlag is the yyPact sentinel: a state whose yyPact entry is at or below
// this value is handled by its default action without a shift lookup.
const yyFlag = -1000

// yyTokname returns the printable name of internal token number c. Tokens are
// numbered from 1, so c maps to yyToknames[c-1]; numbers outside the table,
// or whose table entry is empty, are rendered as "tok-<c>".
func yyTokname(c int) string {
        if c < 1 || c > len(yyToknames) || yyToknames[c-1] == "" {
                return __yyfmt__.Sprintf("tok-%v", c)
        }
        return yyToknames[c-1]
}

// yyStatname returns the printable name of parser state s. States without a
// non-empty entry in yyStatenames are rendered as "state-<s>".
func yyStatname(s int) string {
        if s < 0 || s >= len(yyStatenames) || yyStatenames[s] == "" {
                return __yyfmt__.Sprintf("state-%v", s)
        }
        return yyStatenames[s]
}

// yyErrorMessage builds the text reported for a syntax error raised in parser
// state `state` while looking at token `lookAhead`.
//
// With yyErrorVerbose off the result is always "syntax error". Otherwise a
// matching entry in yyErrorMessages is used if one exists; failing that the
// message names the unexpected token and, when no more than four candidates
// exist, lists the tokens that would have been accepted.
func yyErrorMessage(state, lookAhead int) string {
        // Token numbers below this value are never offered as "expected".
        const firstListedTok = 4
        // To match Bison, suggest at most four expected tokens.
        const maxExpected = 4

        if !yyErrorVerbose {
                return "syntax error"
        }

        // A custom message registered for this exact state/token pair wins.
        for _, entry := range yyErrorMessages {
                if entry.state != state || entry.token != lookAhead {
                        continue
                }
                return "syntax error: " + entry.msg
        }

        msg := "syntax error: unexpected " + yyTokname(lookAhead)
        candidates := make([]int, 0, maxExpected)

        // Look for shiftable tokens: yyAct must yield a state whose check
        // entry equals the token being probed.
        pact := int(yyPact[state])
        for tok := firstListedTok; tok <= len(yyToknames); tok++ {
                n := pact + tok
                if n < 0 || n >= yyLast {
                        continue
                }
                if int(yyChk[int(yyAct[n])]) != tok {
                        continue
                }
                if len(candidates) == maxExpected {
                        // Too many alternatives; report only the unexpected token.
                        return msg
                }
                candidates = append(candidates, tok)
        }

        if yyDef[state] == -2 {
                // Locate this state's section in the exception table; each
                // section is introduced by the pair (-1, state).
                pos := 0
                for !(yyExca[pos] == -1 && int(yyExca[pos+1]) == state) {
                        pos += 2
                }

                // Look for tokens that we accept or reduce.
                pos += 2
                for ; yyExca[pos] >= 0; pos += 2 {
                        tok := int(yyExca[pos])
                        if tok >= firstListedTok && yyExca[pos+1] != 0 {
                                if len(candidates) == maxExpected {
                                        return msg
                                }
                                candidates = append(candidates, tok)
                        }
                }

                // If the default action is to accept or reduce, give up.
                if yyExca[pos+1] != 0 {
                        return msg
                }
        }

        for idx, tok := range candidates {
                sep := " or "
                if idx == 0 {
                        sep = ", expecting "
                }
                msg += sep + yyTokname(tok)
        }
        return msg
}

// yylex1 fetches the next token from lex and translates the lexer's character
// code into the parser's internal token number.
//
// It returns the raw code unchanged as char and the translated number as
// token. Codes <= 0 map to yyTok1[0], small codes index yyTok1, codes in the
// yyPrivate range index yyTok2, and everything else is searched for in the
// yyTok3 pair list. A translation result of 0 is replaced by yyTok2[1], the
// "unknown char" token.
func yylex1(lex yyLexer, lval *yySymType) (char, token int) {
        char = lex.Lex(lval)

        switch {
        case char <= 0:
                token = int(yyTok1[0])
        case char < len(yyTok1):
                token = int(yyTok1[char])
        case char >= yyPrivate && char < yyPrivate+len(yyTok2):
                token = int(yyTok2[char-yyPrivate])
        default:
                // Linear scan of (code, token) pairs. token deliberately
                // holds the last inspected code when nothing matches.
                for i := 0; i < len(yyTok3); i += 2 {
                        token = int(yyTok3[i+0])
                        if token == char {
                                token = int(yyTok3[i+1])
                                break
                        }
                }
        }

        if token == 0 {
                token = int(yyTok2[1]) /* unknown char */
        }
        if yyDebug >= 3 {
                __yyfmt__.Printf("lex %s(%d)\n", yyTokname(token), uint(char))
        }
        return char, token
}

// yyParse creates a new parser via yyNewParser, runs it over the tokens
// supplied by yylex, and returns the result of its Parse method.
func yyParse(yylex yyLexer) int {
        parser := yyNewParser()
        return parser.Parse(yylex)
}

func (yyrcvr *yyParserImpl) Parse(yylex yyLexer) int {
        var yyn int
        var yyVAL yySymType
        var yyDollar []yySymType
        _ = yyDollar // silence set and not used
        yyS := yyrcvr.stack[:]

        Nerrs := 0   /* number of errors */
        Errflag := 0 /* error recovery flag */
        yystate := 0
        yyrcvr.char = -1
        yytoken := -1 // yyrcvr.char translated into internal numbering
        defer func() {
                // Make sure we report no lookahead when not parsing.
                yystate = -1
                yyrcvr.char = -1
                yytoken = -1
        }()
        yyp := -1
        goto yystack

ret0:
        return 0

ret1:
        return 1

yystack:
        /* put a state and value onto the stack */
        if yyDebug >= 4 {
                __yyfmt__.Printf("char %v in %v\n", yyTokname(yytoken), yyStatname(yystate))
        }

        yyp++
        if yyp >= len(yyS) {
                nyys := make([]yySymType, len(yyS)*2)
                copy(nyys, yyS)
                yyS = nyys
        }
        yyS[yyp] = yyVAL
        yyS[yyp].yys = yystate

yynewstate:
        yyn = int(yyPact[yystate])
        if yyn <= yyFlag {
                goto yydefault /* simple state */
        }
        if yyrcvr.char < 0 {
                yyrcvr.char, yytoken = yylex1(yylex, &yyrcvr.lval)
        }
        yyn += yytoken
        if yyn < 0 || yyn >= yyLast {
                goto yydefault
        }
        yyn = int(yyAct[yyn])
        if int(yyChk[yyn]) == yytoken { /* valid shift */
                yyrcvr.char = -1
                yytoken = -1
                yyVAL = yyrcvr.lval
                yystate = yyn
                if Errflag > 0 {
                        Errflag--
                }
                goto yystack
        }

yydefault:
        /* default state action */
        yyn = int(yyDef[yystate])
        if yyn == -2 {
                if yyrcvr.char < 0 {
                        yyrcvr.char, yytoken = yylex1(yylex, &yyrcvr.lval)
                }

                /* look through exception table */
                xi := 0
                for {
                        if yyExca[xi+0] == -1 && int(yyExca[xi+1]) == yystate {
                                break
                        }
                        xi += 2
                }
                for xi += 2; ; xi += 2 {
                        yyn = int(yyExca[xi+0])
                        if yyn < 0 || yyn == yytoken {
                                break
                        }
                }
                yyn = int(yyExca[xi+1])
                if yyn < 0 {
                        goto ret0
                }
        }
        if yyn == 0 {
                /* error ... attempt to resume parsing */
                switch Errflag {
                case 0: /* brand new error */
                        yylex.Error(yyErrorMessage(yystate, yytoken))
                        Nerrs++
                        if yyDebug >= 1 {
                                __yyfmt__.Printf("%s", yyStatname(yystate))
                                __yyfmt__.Printf(" saw %s\n", yyTokname(yytoken))
                        }
                        fallthrough

                case 1, 2: /* incompletely recovered error ... try again */
                        Errflag = 3

                        /* find a state where "error" is a legal shift action */
                        for yyp >= 0 {
                                yyn = int(yyPact[yyS[yyp].yys]) + yyErrCode
                                if yyn >= 0 && yyn < yyLast {
                                        yystate = int(yyAct[yyn]) /* simulate a shift of "error" */
                                        if int(yyChk[yystate]) == yyErrCode {
                                                goto yystack
                                        }
                                }

                                /* the current p has no shift on "error", pop stack */
                                if yyDebug >= 2 {
                                        __yyfmt__.Printf("error recovery pops state %d\n", yyS[yyp].yys)
                                }
                                yyp--
                        }
                        /* there is no state on the stack with an error shift ... abort */
                        goto ret1

                case 3: /* no shift yet; clobber input char */
                        if yyDebug >= 2 {
                                __yyfmt__.Printf("error recovery discards %s\n", yyTokname(yytoken))
                        }
                        if yytoken == yyEofCode {
                                goto ret1
                        }
                        yyrcvr.char = -1
                        yytoken = -1
                        goto yynewstate /* try again in the same state */
                }
        }

        /* reduction by production yyn */
        if yyDebug >= 2 {
                __yyfmt__.Printf("reduce %v in:\n\t%v\n", yyn, yyStatname(yystate))
        }

        yynt := yyn
        yypt := yyp
        _ = yypt // guard against "declared and not used"

        yyp -= int(yyR2[yyn])
        // yyp is now the index of $0. Perform the default action. Iff the
        // reduced production is ε, $1 is possibly out of range.
        if yyp+1 >= len(yyS) {
                nyys := make([]yySymType, len(yyS)*2)
                copy(nyys, yyS)
                yyS = nyys
        }
        yyVAL = yyS[yyp+1]

        /* consult goto table to find next state */
        yyn = int(yyR1[yyn])
        yyg := int(yyPgo[yyn])
        yyj := yyg + yyS[yyp].yys + 1

        if yyj >= yyLast {
                yystate = int(yyAct[yyg])
        } else {
                yystate = int(yyAct[yyj])
                if int(yyChk[yystate]) != -yyn {
                        yystate = int(yyAct[yyg])
                }
        }
        // dummy call; replaced with literal code
        switch yynt {

        case 1:
                yyDollar = yyS[yypt-2 : yypt+1]
                {
                        yylex.(*parser).generatedParserResult = yyDollar[2].labels
                }
        case 3:
                yyDollar = yyS[yypt-2 : yypt+1]
                {
                        yylex.(*parser).addParseErrf(posrange.PositionRange{}, "no expression found in input")
                }
        case 4:
                yyDollar = yyS[yypt-2 : yypt+1]
                {
                        yylex.(*parser).generatedParserResult = yyDollar[2].node
                }
        case 5:
                yyDollar = yyS[yypt-2 : yypt+1]
                {
                        yylex.(*parser).generatedParserResult = yyDollar[2].node
                }
        case 7:
                yyDollar = yyS[yypt-1 : yypt+1]
                {
                        yylex.(*parser).unexpected("", "")
                }
        case 20:
                yyDollar = yyS[yypt-3 : yypt+1]
                {
                        yyVAL.node = yylex.(*parser).newAggregateExpr(yyDollar[1].item, yyDollar[2].node, yyDollar[3].node)
                }
        case 21:
                yyDollar = yyS[yypt-3 : yypt+1]
                {
                        yyVAL.node = yylex.(*parser).newAggregateExpr(yyDollar[1].item, yyDollar[3].node, yyDollar[2].node)
                }
        case 22:
                yyDollar = yyS[yypt-2 : yypt+1]
                {
                        yyVAL.node = yylex.(*parser).newAggregateExpr(yyDollar[1].item, &AggregateExpr{}, yyDollar[2].node)
                }
        case 23:
                yyDollar = yyS[yypt-2 : yypt+1]
                {
                        yylex.(*parser).unexpected("aggregation", "")
                        yyVAL.node = yylex.(*parser).newAggregateExpr(yyDollar[1].item, &AggregateExpr{}, Expressions{})
                }
        case 24:
                yyDollar = yyS[yypt-2 : yypt+1]
                {
                        yyVAL.node = &AggregateExpr{
                                Grouping: yyDollar[2].strings,
                        }
                }
        case 25:
                yyDollar = yyS[yypt-2 : yypt+1]
                {
                        yyVAL.node = &AggregateExpr{
                                Grouping: yyDollar[2].strings,
                                Without:  true,
                        }
                }
        case 26:
                yyDollar = yyS[yypt-4 : yypt+1]
                {
                        yyVAL.node = yylex.(*parser).newBinaryExpression(yyDollar[1].node, yyDollar[2].item, yyDollar[3].node, yyDollar[4].node)
                }
        case 27:
                yyDollar = yyS[yypt-4 : yypt+1]
                {
                        yyVAL.node = yylex.(*parser).newBinaryExpression(yyDollar[1].node, yyDollar[2].item, yyDollar[3].node, yyDollar[4].node)
                }
        case 28:
                yyDollar = yyS[yypt-4 : yypt+1]
                {
                        yyVAL.node = yylex.(*parser).newBinaryExpression(yyDollar[1].node, yyDollar[2].item, yyDollar[3].node, yyDollar[4].node)
                }
        case 29:
                yyDollar = yyS[yypt-4 : yypt+1]
                {
                        yyVAL.node = yylex.(*parser).newBinaryExpression(yyDollar[1].node, yyDollar[2].item, yyDollar[3].node, yyDollar[4].node)
                }
        case 30:
                yyDollar = yyS[yypt-4 : yypt+1]
                {
                        yyVAL.node = yylex.(*parser).newBinaryExpression(yyDollar[1].node, yyDollar[2].item, yyDollar[3].node, yyDollar[4].node)
                }
        case 31:
                yyDollar = yyS[yypt-4 : yypt+1]
                {
                        yyVAL.node = yylex.(*parser).newBinaryExpression(yyDollar[1].node, yyDollar[2].item, yyDollar[3].node, yyDollar[4].node)
                }
        case 32:
                yyDollar = yyS[yypt-4 : yypt+1]
                {
                        yyVAL.node = yylex.(*parser).newBinaryExpression(yyDollar[1].node, yyDollar[2].item, yyDollar[3].node, yyDollar[4].node)
                }
        case 33:
                yyDollar = yyS[yypt-4 : yypt+1]
                {
                        yyVAL.node = yylex.(*parser).newBinaryExpression(yyDollar[1].node, yyDollar[2].item, yyDollar[3].node, yyDollar[4].node)
                }
        case 34:
                yyDollar = yyS[yypt-4 : yypt+1]
                {
                        yyVAL.node = yylex.(*parser).newBinaryExpression(yyDollar[1].node, yyDollar[2].item, yyDollar[3].node, yyDollar[4].node)
                }
        case 35:
                yyDollar = yyS[yypt-4 : yypt+1]
                {
                        yyVAL.node = yylex.(*parser).newBinaryExpression(yyDollar[1].node, yyDollar[2].item, yyDollar[3].node, yyDollar[4].node)
                }
        case 36:
                yyDollar = yyS[yypt-4 : yypt+1]
                {
                        yyVAL.node = yylex.(*parser).newBinaryExpression(yyDollar[1].node, yyDollar[2].item, yyDollar[3].node, yyDollar[4].node)
                }
        case 37:
                yyDollar = yyS[yypt-4 : yypt+1]
                {
                        yyVAL.node = yylex.(*parser).newBinaryExpression(yyDollar[1].node, yyDollar[2].item, yyDollar[3].node, yyDollar[4].node)
                }
        case 38:
                yyDollar = yyS[yypt-4 : yypt+1]
                {
                        yyVAL.node = yylex.(*parser).newBinaryExpression(yyDollar[1].node, yyDollar[2].item, yyDollar[3].node, yyDollar[4].node)
                }
        case 39:
                yyDollar = yyS[yypt-4 : yypt+1]
                {
                        yyVAL.node = yylex.(*parser).newBinaryExpression(yyDollar[1].node, yyDollar[2].item, yyDollar[3].node, yyDollar[4].node)
                }
        case 40:
                yyDollar = yyS[yypt-4 : yypt+1]
                {
                        yyVAL.node = yylex.(*parser).newBinaryExpression(yyDollar[1].node, yyDollar[2].item, yyDollar[3].node, yyDollar[4].node)
                }
        case 41:
                yyDollar = yyS[yypt-4 : yypt+1]
                {
                        yyVAL.node = yylex.(*parser).newBinaryExpression(yyDollar[1].node, yyDollar[2].item, yyDollar[3].node, yyDollar[4].node)
                }
        case 43:
                yyDollar = yyS[yypt-0 : yypt+1]
                {
                        yyVAL.node = &BinaryExpr{
                                VectorMatching: &VectorMatching{Card: CardOneToOne},
                        }
                }
        case 44:
                yyDollar = yyS[yypt-1 : yypt+1]
                {
                        yyVAL.node = &BinaryExpr{
                                VectorMatching: &VectorMatching{Card: CardOneToOne},
                                ReturnBool:     true,
                        }
                }
        case 45:
                yyDollar = yyS[yypt-3 : yypt+1]
                {
                        yyVAL.node = yyDollar[1].node
                        yyVAL.node.(*BinaryExpr).VectorMatching.MatchingLabels = yyDollar[3].strings
                }
        case 46:
                yyDollar = yyS[yypt-3 : yypt+1]
                {
                        yyVAL.node = yyDollar[1].node
                        yyVAL.node.(*BinaryExpr).VectorMatching.MatchingLabels = yyDollar[3].strings
                        yyVAL.node.(*BinaryExpr).VectorMatching.On = true
                }
        case 49:
                yyDollar = yyS[yypt-3 : yypt+1]
                {
                        yyVAL.node = yyDollar[1].node
                        yyVAL.node.(*BinaryExpr).VectorMatching.Card = CardManyToOne
                        yyVAL.node.(*BinaryExpr).VectorMatching.Include = yyDollar[3].strings
                }
        case 50:
                yyDollar = yyS[yypt-3 : yypt+1]
                {
                        yyVAL.node = yyDollar[1].node
                        yyVAL.node.(*BinaryExpr).VectorMatching.Card = CardOneToMany
                        yyVAL.node.(*BinaryExpr).VectorMatching.Include = yyDollar[3].strings
                }
        case 51:
                yyDollar = yyS[yypt-3 : yypt+1]
                {
                        yyVAL.strings = yyDollar[2].strings
                }
        case 52:
                yyDollar = yyS[yypt-4 : yypt+1]
                {
                        yyVAL.strings = yyDollar[2].strings
                }
        case 53:
                yyDollar = yyS[yypt-2 : yypt+1]
                {
                        yyVAL.strings = []string{}
                }
        case 54:
                yyDollar = yyS[yypt-1 : yypt+1]
                {
                        yylex.(*parser).unexpected("grouping opts", "\"(\"")
                        yyVAL.strings = nil
                }
        case 55:
                yyDollar = yyS[yypt-3 : yypt+1]
                {
                        yyVAL.strings = append(yyDollar[1].strings, yyDollar[3].item.Val)
                }
        case 56:
                yyDollar = yyS[yypt-1 : yypt+1]
                {
                        yyVAL.strings = []string{yyDollar[1].item.Val}
                }
        case 57:
                yyDollar = yyS[yypt-2 : yypt+1]
                {
                        yylex.(*parser).unexpected("grouping opts", "\",\" or \")\"")
                        yyVAL.strings = yyDollar[1].strings
                }
        case 58:
                yyDollar = yyS[yypt-1 : yypt+1]
                {
                        if !isLabel(yyDollar[1].item.Val) {
                                yylex.(*parser).unexpected("grouping opts", "label")
                        }
                        yyVAL.item = yyDollar[1].item
                }
        case 59:
                yyDollar = yyS[yypt-1 : yypt+1]
                {
                        yylex.(*parser).unexpected("grouping opts", "label")
                        yyVAL.item = Item{}
                }
        case 60:
                yyDollar = yyS[yypt-2 : yypt+1]
                {
                        fn, exist := getFunction(yyDollar[1].item.Val, yylex.(*parser).functions)
                        if !exist {
                                yylex.(*parser).addParseErrf(yyDollar[1].item.PositionRange(), "unknown function with name %q", yyDollar[1].item.Val)
                        }
                        if fn != nil && fn.Experimental && !EnableExperimentalFunctions {
                                yylex.(*parser).addParseErrf(yyDollar[1].item.PositionRange(), "function %q is not enabled", yyDollar[1].item.Val)
                        }
                        yyVAL.node = &Call{
                                Func: fn,
                                Args: yyDollar[2].node.(Expressions),
                                PosRange: posrange.PositionRange{
                                        Start: yyDollar[1].item.Pos,
                                        End:   yylex.(*parser).lastClosing,
                                },
                        }
                }
        case 61:
                yyDollar = yyS[yypt-3 : yypt+1]
                {
                        yyVAL.node = yyDollar[2].node
                }
        case 62:
                yyDollar = yyS[yypt-2 : yypt+1]
                {
                        yyVAL.node = Expressions{}
                }
        case 63:
                yyDollar = yyS[yypt-3 : yypt+1]
                {
                        yyVAL.node = append(yyDollar[1].node.(Expressions), yyDollar[3].node.(Expr))
                }
        case 64:
                yyDollar = yyS[yypt-1 : yypt+1]
                {
                        yyVAL.node = Expressions{yyDollar[1].node.(Expr)}
                }
        case 65:
                yyDollar = yyS[yypt-2 : yypt+1]
                {
                        yylex.(*parser).addParseErrf(yyDollar[2].item.PositionRange(), "trailing commas not allowed in function call args")
                        yyVAL.node = yyDollar[1].node
                }
        case 66:
                yyDollar = yyS[yypt-3 : yypt+1]
                {
                        yyVAL.node = &ParenExpr{Expr: yyDollar[2].node.(Expr), PosRange: mergeRanges(&yyDollar[1].item, &yyDollar[3].item)}
                }
        case 67:
                yyDollar = yyS[yypt-3 : yypt+1]
                {
                        yylex.(*parser).addOffset(yyDollar[1].node, yyDollar[3].duration)
                        yyVAL.node = yyDollar[1].node
                }
        case 68:
                yyDollar = yyS[yypt-4 : yypt+1]
                {
                        yylex.(*parser).addOffset(yyDollar[1].node, -yyDollar[4].duration)
                        yyVAL.node = yyDollar[1].node
                }
        case 69:
                yyDollar = yyS[yypt-3 : yypt+1]
                {
                        yylex.(*parser).unexpected("offset", "duration")
                        yyVAL.node = yyDollar[1].node
                }
        case 70:
                yyDollar = yyS[yypt-3 : yypt+1]
                {
                        yylex.(*parser).setTimestamp(yyDollar[1].node, yyDollar[3].float)
                        yyVAL.node = yyDollar[1].node
                }
        case 71:
                yyDollar = yyS[yypt-5 : yypt+1]
                {
                        yylex.(*parser).setAtModifierPreprocessor(yyDollar[1].node, yyDollar[3].item)
                        yyVAL.node = yyDollar[1].node
                }
        case 72:
                yyDollar = yyS[yypt-3 : yypt+1]
                {
                        yylex.(*parser).unexpected("@", "timestamp")
                        yyVAL.node = yyDollar[1].node
                }
        case 75:
                yyDollar = yyS[yypt-4 : yypt+1]
                {
                        var errMsg string
                        vs, ok := yyDollar[1].node.(*VectorSelector)
                        if !ok {
                                errMsg = "ranges only allowed for vector selectors"
                        } else if vs.OriginalOffset != 0 {
                                errMsg = "no offset modifiers allowed before range"
                        } else if vs.Timestamp != nil {
                                errMsg = "no @ modifiers allowed before range"
                        }

                        if errMsg != "" {
                                errRange := mergeRanges(&yyDollar[2].item, &yyDollar[4].item)
                                yylex.(*parser).addParseErrf(errRange, errMsg)
                        }

                        yyVAL.node = &MatrixSelector{
                                VectorSelector: yyDollar[1].node.(Expr),
                                Range:          yyDollar[3].duration,
                                EndPos:         yylex.(*parser).lastClosing,
                        }
                }
        case 76:
                yyDollar = yyS[yypt-6 : yypt+1]
                {
                        yyVAL.node = &SubqueryExpr{
                                Expr:  yyDollar[1].node.(Expr),
                                Range: yyDollar[3].duration,
                                Step:  yyDollar[5].duration,

                                EndPos: yyDollar[6].item.Pos + 1,
                        }
                }
        case 77:
                yyDollar = yyS[yypt-6 : yypt+1]
                {
                        yylex.(*parser).unexpected("subquery selector", "\"]\"")
                        yyVAL.node = yyDollar[1].node
                }
        case 78:
                yyDollar = yyS[yypt-5 : yypt+1]
                {
                        yylex.(*parser).unexpected("subquery selector", "duration or \"]\"")
                        yyVAL.node = yyDollar[1].node
                }
        case 79:
                yyDollar = yyS[yypt-4 : yypt+1]
                {
                        yylex.(*parser).unexpected("subquery or range", "\":\" or \"]\"")
                        yyVAL.node = yyDollar[1].node
                }
        case 80:
                yyDollar = yyS[yypt-3 : yypt+1]
                {
                        yylex.(*parser).unexpected("subquery selector", "duration")
                        yyVAL.node = yyDollar[1].node
                }
        case 81:
                yyDollar = yyS[yypt-2 : yypt+1]
                {
                        if nl, ok := yyDollar[2].node.(*NumberLiteral); ok {
                                if yyDollar[1].item.Typ == SUB {
                                        nl.Val *= -1
                                }
                                nl.PosRange.Start = yyDollar[1].item.Pos
                                yyVAL.node = nl
                        } else {
                                yyVAL.node = &UnaryExpr{Op: yyDollar[1].item.Typ, Expr: yyDollar[2].node.(Expr), StartPos: yyDollar[1].item.Pos}
                        }
                }
        case 82:
                yyDollar = yyS[yypt-2 : yypt+1]
                {
                        vs := yyDollar[2].node.(*VectorSelector)
                        vs.PosRange = mergeRanges(&yyDollar[1].item, vs)
                        vs.Name = yyDollar[1].item.Val
                        yylex.(*parser).assembleVectorSelector(vs)
                        yyVAL.node = vs
                }
        case 83:
                yyDollar = yyS[yypt-1 : yypt+1]
                {
                        vs := &VectorSelector{
                                Name:          yyDollar[1].item.Val,
                                LabelMatchers: []*labels.Matcher{},
                                PosRange:      yyDollar[1].item.PositionRange(),
                        }
                        yylex.(*parser).assembleVectorSelector(vs)
                        yyVAL.node = vs
                }
        case 84:
                yyDollar = yyS[yypt-1 : yypt+1]
                {
                        vs := yyDollar[1].node.(*VectorSelector)
                        yylex.(*parser).assembleVectorSelector(vs)
                        yyVAL.node = vs
                }
        case 85:
                yyDollar = yyS[yypt-3 : yypt+1]
                {
                        yyVAL.node = &VectorSelector{
                                LabelMatchers: yyDollar[2].matchers,
                                PosRange:      mergeRanges(&yyDollar[1].item, &yyDollar[3].item),
                        }
                }
        case 86:
                yyDollar = yyS[yypt-4 : yypt+1]
                {
                        yyVAL.node = &VectorSelector{
                                LabelMatchers: yyDollar[2].matchers,
                                PosRange:      mergeRanges(&yyDollar[1].item, &yyDollar[4].item),
                        }
                }
        case 87:
                yyDollar = yyS[yypt-2 : yypt+1]
                {
                        yyVAL.node = &VectorSelector{
                                LabelMatchers: []*labels.Matcher{},
                                PosRange:      mergeRanges(&yyDollar[1].item, &yyDollar[2].item),
                        }
                }
        case 88:
                yyDollar = yyS[yypt-3 : yypt+1]
                {
                        if yyDollar[1].matchers != nil {
                                yyVAL.matchers = append(yyDollar[1].matchers, yyDollar[3].matcher)
                        } else {
                                yyVAL.matchers = yyDollar[1].matchers
                        }
                }
        case 89:
                yyDollar = yyS[yypt-1 : yypt+1]
                {
                        yyVAL.matchers = []*labels.Matcher{yyDollar[1].matcher}
                }
        case 90:
                yyDollar = yyS[yypt-2 : yypt+1]
                {
                        yylex.(*parser).unexpected("label matching", "\",\" or \"}\"")
                        yyVAL.matchers = yyDollar[1].matchers
                }
        case 91:
                yyDollar = yyS[yypt-3 : yypt+1]
                {
                        yyVAL.matcher = yylex.(*parser).newLabelMatcher(yyDollar[1].item, yyDollar[2].item, yyDollar[3].item)
                }
        case 92:
                yyDollar = yyS[yypt-3 : yypt+1]
                {
                        yyVAL.matcher = yylex.(*parser).newLabelMatcher(yyDollar[1].item, yyDollar[2].item, yyDollar[3].item)
                }
        case 93:
                yyDollar = yyS[yypt-1 : yypt+1]
                {
                        yyVAL.matcher = yylex.(*parser).newMetricNameMatcher(yyDollar[1].item)
                }
        case 94:
                yyDollar = yyS[yypt-3 : yypt+1]
                {
                        yylex.(*parser).unexpected("label matching", "string")
                        yyVAL.matcher = nil
                }
        case 95:
                yyDollar = yyS[yypt-3 : yypt+1]
                {
                        yylex.(*parser).unexpected("label matching", "string")
                        yyVAL.matcher = nil
                }
        case 96:
                yyDollar = yyS[yypt-2 : yypt+1]
                {
                        yylex.(*parser).unexpected("label matching", "label matching operator")
                        yyVAL.matcher = nil
                }
        case 97:
                yyDollar = yyS[yypt-1 : yypt+1]
                {
                        yylex.(*parser).unexpected("label matching", "identifier or \"}\"")
                        yyVAL.matcher = nil
                }
        case 98:
                yyDollar = yyS[yypt-2 : yypt+1]
                {
                        b := labels.NewBuilder(yyDollar[2].labels)
                        b.Set(labels.MetricName, yyDollar[1].item.Val)
                        yyVAL.labels = b.Labels()
                }
        case 99:
                yyDollar = yyS[yypt-1 : yypt+1]
                {
                        yyVAL.labels = yyDollar[1].labels
                }
        case 122:
                yyDollar = yyS[yypt-3 : yypt+1]
                {
                        yyVAL.labels = labels.New(yyDollar[2].lblList...)
                }
        case 123:
                yyDollar = yyS[yypt-4 : yypt+1]
                {
                        yyVAL.labels = labels.New(yyDollar[2].lblList...)
                }
        case 124:
                yyDollar = yyS[yypt-2 : yypt+1]
                {
                        yyVAL.labels = labels.New()
                }
        case 125:
                yyDollar = yyS[yypt-0 : yypt+1]
                {
                        yyVAL.labels = labels.New()
                }
        case 126:
                yyDollar = yyS[yypt-3 : yypt+1]
                {
                        yyVAL.lblList = append(yyDollar[1].lblList, yyDollar[3].label)
                }
        case 127:
                yyDollar = yyS[yypt-1 : yypt+1]
                {
                        yyVAL.lblList = []labels.Label{yyDollar[1].label}
                }
        case 128:
                yyDollar = yyS[yypt-2 : yypt+1]
                {
                        yylex.(*parser).unexpected("label set", "\",\" or \"}\"")
                        yyVAL.lblList = yyDollar[1].lblList
                }
        case 129:
                yyDollar = yyS[yypt-3 : yypt+1]
                {
                        yyVAL.label = labels.Label{Name: yyDollar[1].item.Val, Value: yylex.(*parser).unquoteString(yyDollar[3].item.Val)}
                }
        case 130:
                yyDollar = yyS[yypt-3 : yypt+1]
                {
                        yylex.(*parser).unexpected("label set", "string")
                        yyVAL.label = labels.Label{}
                }
        case 131:
                yyDollar = yyS[yypt-2 : yypt+1]
                {
                        yylex.(*parser).unexpected("label set", "\"=\"")
                        yyVAL.label = labels.Label{}
                }
        case 132:
                yyDollar = yyS[yypt-1 : yypt+1]
                {
                        yylex.(*parser).unexpected("label set", "identifier or \"}\"")
                        yyVAL.label = labels.Label{}
                }
        case 133:
                yyDollar = yyS[yypt-2 : yypt+1]
                {
                        yylex.(*parser).generatedParserResult = &seriesDescription{
                                labels: yyDollar[1].labels,
                                values: yyDollar[2].series,
                        }
                }
        case 134:
                yyDollar = yyS[yypt-0 : yypt+1]
                {
                        yyVAL.series = []SequenceValue{}
                }
        case 135:
                yyDollar = yyS[yypt-3 : yypt+1]
                {
                        yyVAL.series = append(yyDollar[1].series, yyDollar[3].series...)
                }
        case 136:
                yyDollar = yyS[yypt-2 : yypt+1]
                {
                        yyVAL.series = yyDollar[1].series
                }
        case 137:
                yyDollar = yyS[yypt-1 : yypt+1]
                {
                        yylex.(*parser).unexpected("series values", "")
                        yyVAL.series = nil
                }
        case 138:
                yyDollar = yyS[yypt-1 : yypt+1]
                {
                        yyVAL.series = []SequenceValue{{Omitted: true}}
                }
        case 139:
                yyDollar = yyS[yypt-3 : yypt+1]
                {
                        yyVAL.series = []SequenceValue{}
                        for i := uint64(0); i < yyDollar[3].uint; i++ {
                                yyVAL.series = append(yyVAL.series, SequenceValue{Omitted: true})
                        }
                }
        case 140:
                yyDollar = yyS[yypt-1 : yypt+1]
                {
                        yyVAL.series = []SequenceValue{{Value: yyDollar[1].float}}
                }
        case 141:
                yyDollar = yyS[yypt-3 : yypt+1]
                {
                        yyVAL.series = []SequenceValue{}
                        // Add an additional value for time 0, which we ignore in tests.
                        for i := uint64(0); i <= yyDollar[3].uint; i++ {
                                yyVAL.series = append(yyVAL.series, SequenceValue{Value: yyDollar[1].float})
                        }
                }
        case 142:
                yyDollar = yyS[yypt-4 : yypt+1]
                {
                        yyVAL.series = []SequenceValue{}
                        // Add an additional value for time 0, which we ignore in tests.
                        for i := uint64(0); i <= yyDollar[4].uint; i++ {
                                yyVAL.series = append(yyVAL.series, SequenceValue{Value: yyDollar[1].float})
                                yyDollar[1].float += yyDollar[2].float
                        }
                }
        case 143:
                yyDollar = yyS[yypt-1 : yypt+1]
                {
                        yyVAL.series = []SequenceValue{{Histogram: yyDollar[1].histogram}}
                }
        case 144:
                yyDollar = yyS[yypt-3 : yypt+1]
                {
                        yyVAL.series = []SequenceValue{}
                        // Add an additional value for time 0, which we ignore in tests.
                        for i := uint64(0); i <= yyDollar[3].uint; i++ {
                                yyVAL.series = append(yyVAL.series, SequenceValue{Histogram: yyDollar[1].histogram})
                                //$1 += $2
                        }
                }
        case 145:
                yyDollar = yyS[yypt-5 : yypt+1]
                {
                        val, err := yylex.(*parser).histogramsIncreaseSeries(yyDollar[1].histogram, yyDollar[3].histogram, yyDollar[5].uint)
                        if err != nil {
                                yylex.(*parser).addSemanticError(err)
                        }
                        yyVAL.series = val
                }
        case 146:
                yyDollar = yyS[yypt-5 : yypt+1]
                {
                        val, err := yylex.(*parser).histogramsDecreaseSeries(yyDollar[1].histogram, yyDollar[3].histogram, yyDollar[5].uint)
                        if err != nil {
                                yylex.(*parser).addSemanticError(err)
                        }
                        yyVAL.series = val
                }
        case 147:
                yyDollar = yyS[yypt-1 : yypt+1]
                {
                        if yyDollar[1].item.Val != "stale" {
                                yylex.(*parser).unexpected("series values", "number or \"stale\"")
                        }
                        yyVAL.float = math.Float64frombits(value.StaleNaN)
                }
        case 150:
                yyDollar = yyS[yypt-4 : yypt+1]
                {
                        yyVAL.histogram = yylex.(*parser).buildHistogramFromMap(&yyDollar[2].descriptors)
                }
        case 151:
                yyDollar = yyS[yypt-3 : yypt+1]
                {
                        yyVAL.histogram = yylex.(*parser).buildHistogramFromMap(&yyDollar[2].descriptors)
                }
        case 152:
                yyDollar = yyS[yypt-3 : yypt+1]
                {
                        m := yylex.(*parser).newMap()
                        yyVAL.histogram = yylex.(*parser).buildHistogramFromMap(&m)
                }
        case 153:
                yyDollar = yyS[yypt-2 : yypt+1]
                {
                        m := yylex.(*parser).newMap()
                        yyVAL.histogram = yylex.(*parser).buildHistogramFromMap(&m)
                }
        case 154:
                yyDollar = yyS[yypt-3 : yypt+1]
                {
                        yyVAL.descriptors = *(yylex.(*parser).mergeMaps(&yyDollar[1].descriptors, &yyDollar[3].descriptors))
                }
        case 155:
                yyDollar = yyS[yypt-1 : yypt+1]
                {
                        yyVAL.descriptors = yyDollar[1].descriptors
                }
        case 156:
                yyDollar = yyS[yypt-2 : yypt+1]
                {
                        yylex.(*parser).unexpected("histogram description", "histogram description key, e.g. buckets:[5 10 7]")
                }
        case 157:
                yyDollar = yyS[yypt-3 : yypt+1]
                {
                        yyVAL.descriptors = yylex.(*parser).newMap()
                        yyVAL.descriptors["schema"] = yyDollar[3].int
                }
        case 158:
                yyDollar = yyS[yypt-3 : yypt+1]
                {
                        yyVAL.descriptors = yylex.(*parser).newMap()
                        yyVAL.descriptors["sum"] = yyDollar[3].float
                }
        case 159:
                yyDollar = yyS[yypt-3 : yypt+1]
                {
                        yyVAL.descriptors = yylex.(*parser).newMap()
                        yyVAL.descriptors["count"] = yyDollar[3].float
                }
        case 160:
                yyDollar = yyS[yypt-3 : yypt+1]
                {
                        yyVAL.descriptors = yylex.(*parser).newMap()
                        yyVAL.descriptors["z_bucket"] = yyDollar[3].float
                }
        case 161:
                yyDollar = yyS[yypt-3 : yypt+1]
                {
                        yyVAL.descriptors = yylex.(*parser).newMap()
                        yyVAL.descriptors["z_bucket_w"] = yyDollar[3].float
                }
        case 162:
                yyDollar = yyS[yypt-3 : yypt+1]
                {
                        yyVAL.descriptors = yylex.(*parser).newMap()
                        yyVAL.descriptors["custom_values"] = yyDollar[3].bucket_set
                }
        case 163:
                yyDollar = yyS[yypt-3 : yypt+1]
                {
                        yyVAL.descriptors = yylex.(*parser).newMap()
                        yyVAL.descriptors["buckets"] = yyDollar[3].bucket_set
                }
        case 164:
                yyDollar = yyS[yypt-3 : yypt+1]
                {
                        yyVAL.descriptors = yylex.(*parser).newMap()
                        yyVAL.descriptors["offset"] = yyDollar[3].int
                }
        case 165:
                yyDollar = yyS[yypt-3 : yypt+1]
                {
                        yyVAL.descriptors = yylex.(*parser).newMap()
                        yyVAL.descriptors["n_buckets"] = yyDollar[3].bucket_set
                }
        case 166:
                yyDollar = yyS[yypt-3 : yypt+1]
                {
                        yyVAL.descriptors = yylex.(*parser).newMap()
                        yyVAL.descriptors["n_offset"] = yyDollar[3].int
                }
        case 167:
                yyDollar = yyS[yypt-4 : yypt+1]
                {
                        yyVAL.bucket_set = yyDollar[2].bucket_set
                }
        case 168:
                yyDollar = yyS[yypt-3 : yypt+1]
                {
                        yyVAL.bucket_set = yyDollar[2].bucket_set
                }
        case 169:
                yyDollar = yyS[yypt-3 : yypt+1]
                {
                        yyVAL.bucket_set = append(yyDollar[1].bucket_set, yyDollar[3].float)
                }
        case 170:
                yyDollar = yyS[yypt-1 : yypt+1]
                {
                        yyVAL.bucket_set = []float64{yyDollar[1].float}
                }
        case 217:
                yyDollar = yyS[yypt-1 : yypt+1]
                {
                        yyVAL.node = &NumberLiteral{
                                Val:      yylex.(*parser).number(yyDollar[1].item.Val),
                                PosRange: yyDollar[1].item.PositionRange(),
                        }
                }
        case 218:
                yyDollar = yyS[yypt-1 : yypt+1]
                {
                        yyVAL.float = yylex.(*parser).number(yyDollar[1].item.Val)
                }
        case 219:
                yyDollar = yyS[yypt-2 : yypt+1]
                {
                        yyVAL.float = yyDollar[2].float
                }
        case 220:
                yyDollar = yyS[yypt-2 : yypt+1]
                {
                        yyVAL.float = -yyDollar[2].float
                }
        case 223:
                yyDollar = yyS[yypt-1 : yypt+1]
                {
                        var err error
                        yyVAL.uint, err = strconv.ParseUint(yyDollar[1].item.Val, 10, 64)
                        if err != nil {
                                yylex.(*parser).addParseErrf(yyDollar[1].item.PositionRange(), "invalid repetition in series values: %s", err)
                        }
                }
        case 224:
                yyDollar = yyS[yypt-2 : yypt+1]
                {
                        yyVAL.int = -int64(yyDollar[2].uint)
                }
        case 225:
                yyDollar = yyS[yypt-1 : yypt+1]
                {
                        yyVAL.int = int64(yyDollar[1].uint)
                }
        case 226:
                yyDollar = yyS[yypt-1 : yypt+1]
                {
                        var err error
                        yyVAL.duration, err = parseDuration(yyDollar[1].item.Val)
                        if err != nil {
                                yylex.(*parser).addParseErr(yyDollar[1].item.PositionRange(), err)
                        }
                }
        case 227:
                yyDollar = yyS[yypt-1 : yypt+1]
                {
                        yyVAL.node = &StringLiteral{
                                Val:      yylex.(*parser).unquoteString(yyDollar[1].item.Val),
                                PosRange: yyDollar[1].item.PositionRange(),
                        }
                }
        case 228:
                yyDollar = yyS[yypt-1 : yypt+1]
                {
                        yyVAL.item = Item{
                                Typ: METRIC_IDENTIFIER,
                                Pos: yyDollar[1].item.PositionRange().Start,
                                Val: yylex.(*parser).unquoteString(yyDollar[1].item.Val),
                        }
                }
        case 229:
                yyDollar = yyS[yypt-0 : yypt+1]
                {
                        yyVAL.duration = 0
                }
        case 231:
                yyDollar = yyS[yypt-0 : yypt+1]
                {
                        yyVAL.strings = nil
                }
        }
        goto yystack /* stack new state and value */
}

// Copyright 2015 The Prometheus Authors
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package parser

import (
        "fmt"
        "strings"
        "unicode"
        "unicode/utf8"

        "github.com/prometheus/prometheus/promql/parser/posrange"
)

// Item represents a token or text string returned from the scanner.
// It bundles the token's type, its starting position and its text.
type Item struct {
        Typ ItemType     // The type of this Item.
        Pos posrange.Pos // The starting position, in bytes, of this Item in the input string.
        Val string       // The value of this Item.
}

// String renders the Item in a human-readable form.
//
// EOF and ERROR items are special-cased, identifiers are quoted in full,
// keywords, operators and aggregators are wrapped in angle brackets, and
// any other value is quoted, truncated to ten characters when it is longer.
func (i Item) String() string {
        typ, text := i.Typ, i.Val

        // Exact type matches take precedence over the range-based checks below.
        switch typ {
        case EOF:
                return "EOF"
        case ERROR:
                return text
        case IDENTIFIER, METRIC_IDENTIFIER:
                return fmt.Sprintf("%q", text)
        }

        if typ.IsKeyword() {
                return fmt.Sprintf("<%s>", text)
        }
        if typ.IsOperator() {
                return fmt.Sprintf("<op:%s>", text)
        }
        if typ.IsAggregator() {
                return fmt.Sprintf("<aggr:%s>", text)
        }
        if len(text) > 10 {
                // Long values are cut short so messages stay readable.
                return fmt.Sprintf("%.10q...", text)
        }
        return fmt.Sprintf("%q", text)
}

// Pretty returns the prettified form of an item. The integer argument is
// ignored; the result is exactly what String returns.
func (i Item) Pretty(_ int) string {
        return i.String()
}

// IsOperator reports whether the ItemType is an operator, i.e. whether it
// lies strictly between the operatorsStart and operatorsEnd markers.
func (i ItemType) IsOperator() bool {
        return operatorsStart < i && i < operatorsEnd
}

// IsAggregator reports whether the ItemType is an aggregator, i.e. whether
// it lies strictly between the aggregatorsStart and aggregatorsEnd markers.
func (i ItemType) IsAggregator() bool {
        return aggregatorsStart < i && i < aggregatorsEnd
}

// IsAggregatorWithParam reports whether the ItemType is one of the
// aggregators that take a parameter: TOPK, BOTTOMK, COUNT_VALUES or QUANTILE.
func (i ItemType) IsAggregatorWithParam() bool {
        switch i {
        case TOPK, BOTTOMK, COUNT_VALUES, QUANTILE:
                return true
        }
        return false
}

// IsKeyword reports whether the ItemType is a keyword, i.e. whether it lies
// strictly between the keywordsStart and keywordsEnd markers.
func (i ItemType) IsKeyword() bool {
        return keywordsStart < i && i < keywordsEnd
}

// IsComparisonOperator reports whether the ItemType is one of the comparison
// operators: EQLC, NEQ, LTE, LSS, GTE or GTR.
func (i ItemType) IsComparisonOperator() bool {
        return i == EQLC || i == NEQ ||
                i == LTE || i == LSS ||
                i == GTE || i == GTR
}

// IsSetOperator reports whether the ItemType is one of the set operators:
// LAND, LOR or LUNLESS.
func (i ItemType) IsSetOperator() bool {
        return i == LAND || i == LOR || i == LUNLESS
}

// ItemType identifies the type of an Item.
type ItemType int

// key maps the textual spelling of every PromQL keyword (operators,
// aggregators, keywords and preprocessors) to its ItemType.
// When changing this list, make sure to also change
// the maybe_label grammar rule in the generated parser
// to avoid misinterpretation of labels as keywords.
// init additionally registers "inf" and "nan" as NUMBER.
var key = map[string]ItemType{
        // Operators.
        "and":    LAND,
        "or":     LOR,
        "unless": LUNLESS,
        "atan2":  ATAN2,

        // Aggregators.
        "sum":          SUM,
        "avg":          AVG,
        "count":        COUNT,
        "min":          MIN,
        "max":          MAX,
        "group":        GROUP,
        "stddev":       STDDEV,
        "stdvar":       STDVAR,
        "topk":         TOPK,
        "bottomk":      BOTTOMK,
        "count_values": COUNT_VALUES,
        "quantile":     QUANTILE,

        // Keywords.
        "offset":      OFFSET,
        "by":          BY,
        "without":     WITHOUT,
        "on":          ON,
        "ignoring":    IGNORING,
        "group_left":  GROUP_LEFT,
        "group_right": GROUP_RIGHT,
        "bool":        BOOL,

        // Preprocessors.
        "start": START,
        "end":   END,
}

// histogramDesc maps the descriptor names that are recognized inside a
// histogram description ({{...}}) to their Item types. lexHistogramDescriptor
// looks words up in lower-cased form and requires a ':' after the descriptor.
var histogramDesc = map[string]ItemType{
        "sum":           SUM_DESC,
        "count":         COUNT_DESC,
        "schema":        SCHEMA_DESC,
        "offset":        OFFSET_DESC,
        "n_offset":      NEGATIVE_OFFSET_DESC,
        "buckets":       BUCKETS_DESC,
        "n_buckets":     NEGATIVE_BUCKETS_DESC,
        "z_bucket":      ZERO_BUCKET_DESC,
        "z_bucket_w":    ZERO_BUCKET_WIDTH_DESC,
        "custom_values": CUSTOM_VALUES_DESC,
}

// ItemTypeStr is the default string representations for common Items. It does not
// imply that those are the only character sequences that can be lexed to such an Item.
//
// init extends this map with one entry per keyword in key, using the keyword's
// spelling as its representation.
var ItemTypeStr = map[ItemType]string{
        OPEN_HIST:     "{{",
        CLOSE_HIST:    "}}",
        LEFT_PAREN:    "(",
        RIGHT_PAREN:   ")",
        LEFT_BRACE:    "{",
        RIGHT_BRACE:   "}",
        LEFT_BRACKET:  "[",
        RIGHT_BRACKET: "]",
        COMMA:         ",",
        EQL:           "=",
        COLON:         ":",
        SEMICOLON:     ";",
        BLANK:         "_",
        TIMES:         "x",
        SPACE:         "<space>",

        SUB:       "-",
        ADD:       "+",
        MUL:       "*",
        MOD:       "%",
        DIV:       "/",
        EQLC:      "==",
        NEQ:       "!=",
        LTE:       "<=",
        LSS:       "<",
        GTE:       ">=",
        GTR:       ">",
        EQL_REGEX: "=~",
        NEQ_REGEX: "!~",
        POW:       "^",
}

// init completes the lookup tables. Every keyword first receives its spelling
// as default string representation; only afterwards are the special number
// literals registered, so that NUMBER does not pick up "inf" or "nan" as its
// string form.
func init() {
        for word, typ := range key {
                ItemTypeStr[typ] = word
        }
        for _, special := range []string{"inf", "nan"} {
                key[special] = NUMBER
        }
}

// String returns the representation registered in ItemTypeStr, or a generic
// "<Item N>" placeholder for types that have none.
func (i ItemType) String() string {
        repr, known := ItemTypeStr[i]
        if !known {
                return fmt.Sprintf("<Item %d>", i)
        }
        return repr
}

// desc returns a description of the item for use in messages. Items whose type
// has a default string representation are described by the item's own string
// form, EOF by its type description alone, and everything else by the type
// description followed by the item.
func (i Item) desc() string {
        _, hasRepr := ItemTypeStr[i.Typ]
        switch {
        case hasRepr:
                return i.String()
        case i.Typ == EOF:
                return i.Typ.desc()
        default:
                return fmt.Sprintf("%s %s", i.Typ.desc(), i)
        }
}

// desc returns a short human-readable name for the item type. Types without a
// dedicated name are rendered as their quoted string form.
func (i ItemType) desc() string {
        var name string
        switch i {
        case ERROR:
                name = "error"
        case EOF:
                name = "end of input"
        case COMMENT:
                name = "comment"
        case IDENTIFIER:
                name = "identifier"
        case METRIC_IDENTIFIER:
                name = "metric identifier"
        case STRING:
                name = "string"
        case NUMBER:
                name = "number"
        case DURATION:
                name = "duration"
        default:
                name = fmt.Sprintf("%q", i)
        }
        return name
}

// eof is the sentinel rune returned by Lexer.next once the input is exhausted.
const eof = -1

// stateFn represents the state of the scanner as a function that returns the next state.
// A nil stateFn means the scan has ended; NextItem then only emits EOF items.
type stateFn func(*Lexer) stateFn

// histogramState tracks where the lexer is relative to a histogram description
// ({{...}}) inside a series description.
type histogramState int

const (
        // histogramStateNone: not inside a histogram description.
        histogramStateNone histogramState = iota
        // histogramStateOpen: set when the opening "{{" has been emitted.
        histogramStateOpen
        // histogramStateMul, histogramStateAdd and histogramStateSub: set after
        // the closing "}}" when the next rune is 'x', '+' or '-' respectively;
        // lexHistogram then emits that rune as TIMES, ADD or SUB.
        histogramStateMul
        histogramStateAdd
        histogramStateSub
)

// Lexer holds the state of the scanner. It is driven by NextItem, which runs
// the current state function until one Item has been written to the address
// supplied by the caller.
type Lexer struct {
        input       string       // The string being scanned.
        state       stateFn      // The next lexing function to enter.
        pos         posrange.Pos // Current position in the input.
        start       posrange.Pos // Start position of this Item.
        width       posrange.Pos // Width of last rune read from input.
        lastPos     posrange.Pos // Position of most recent Item returned by NextItem.
        itemp       *Item        // Pointer to where the next scanned item should be placed.
        scannedItem bool         // Set to true every time an item is scanned.

        parenDepth  int  // Nesting depth of ( ) exprs.
        braceOpen   bool // Whether a { is opened.
        bracketOpen bool // Whether a [ is opened.
        gotColon    bool // Whether we got a ':' after [ was opened.
        stringOpen  rune // Quote rune of the string currently being read.

        // series description variables for internal PromQL testing framework as well as in promtool rules unit tests.
        // see https://prometheus.io/docs/prometheus/latest/configuration/unit_testing_rules/#series
        seriesDesc     bool           // Whether we are lexing a series description.
        histogramState histogramState // Determines whether or not inside of a histogram description.
}

// next decodes and consumes the next rune of the input and records its width
// so that a single backup can undo the read. At the end of the input it
// returns eof with a width of zero.
func (l *Lexer) next() rune {
        if int(l.pos) < len(l.input) {
                r, size := utf8.DecodeRuneInString(l.input[l.pos:])
                l.width = posrange.Pos(size)
                l.pos += l.width
                return r
        }
        // Input exhausted: a zero width turns a following backup into a no-op.
        l.width = 0
        return eof
}

// peek reports the next rune of the input without consuming it.
func (l *Lexer) peek() rune {
        r := l.next()
        // Undo the read right away; width still describes the peeked rune.
        l.pos -= l.width
        return r
}

// backup steps back one rune. Can only be called once per call of next.
// After next returned eof the recorded width is zero, so backup is a no-op.
func (l *Lexer) backup() {
        l.pos -= l.width
}

// emit writes an Item of type t covering the pending input (from start up to
// the current position) to the address supplied to NextItem, then begins a new
// pending span at the current position.
func (l *Lexer) emit(t ItemType) {
        end := l.pos
        *l.itemp = Item{t, l.start, l.input[l.start:end]}
        l.start = end
        l.scannedItem = true
}

// ignore skips over the pending input before this point, so that the next
// emitted Item starts at the current position.
func (l *Lexer) ignore() {
        l.start = l.pos
}

// accept consumes the next rune if it is contained in valid and reports
// whether it did so. A rune that is not in valid is left unconsumed.
func (l *Lexer) accept(valid string) bool {
        r := l.next()
        if !strings.ContainsRune(valid, r) {
                l.backup()
                return false
        }
        return true
}

// is reports whether the next rune is contained in valid, without consuming it.
func (l *Lexer) is(valid string) bool {
        upcoming := l.peek()
        return strings.ContainsRune(valid, upcoming)
}

// acceptRun consumes runes for as long as they are contained in valid and
// stops in front of the first rune that is not.
func (l *Lexer) acceptRun(valid string) {
        for l.accept(valid) {
                // Keep consuming; accept backs up over the first non-member.
        }
}

// errorf writes an ERROR item carrying the formatted message to the address
// supplied to NextItem and returns a nil state. A state function that returns
// this nil value ends the scan; every later NextItem call then yields EOF.
func (l *Lexer) errorf(format string, args ...interface{}) stateFn {
        msg := fmt.Sprintf(format, args...)
        *l.itemp = Item{ERROR, l.start, msg}
        l.scannedItem = true

        return nil
}

// NextItem writes the next item to the provided address. State functions are
// run until one of them has scanned an item; once the scan has ended (nil
// state) every call produces an EOF item.
func (l *Lexer) NextItem(itemp *Item) {
        l.itemp = itemp
        l.scannedItem = false

        if l.state == nil {
                l.emit(EOF)
        } else {
                for !l.scannedItem {
                        l.state = l.state(l)
                }
        }

        l.lastPos = itemp.Pos
}

// Lex creates a new scanner for the input string, starting in the top-level
// lexStatements state.
func Lex(input string) *Lexer {
        return &Lexer{
                input: input,
                state: lexStatements,
        }
}

// lineComment is the character that starts a line comment. The comment runs
// up to, but not including, the next end-of-line character or the end of input.
const lineComment = "#"

// lexStatements is the top-level state for lexing.
//
// It first delegates to lexHistogram while a histogram description is in
// progress, to lexInsideBraces while a '{' is open, and to lexLineComment when
// the remaining input starts with the comment marker. Otherwise it consumes one
// rune and either emits the matching Item, hands over to a more specific state
// function, or reports an error through errorf, which ends the scan.
func lexStatements(l *Lexer) stateFn {
        if l.histogramState != histogramStateNone {
                return lexHistogram
        }
        if l.braceOpen {
                return lexInsideBraces
        }
        if strings.HasPrefix(l.input[l.pos:], lineComment) {
                return lexLineComment
        }

        switch r := l.next(); {
        case r == eof:
                // Unbalanced '(' or '[' at the end of the input is an error.
                switch {
                case l.parenDepth != 0:
                        return l.errorf("unclosed left parenthesis")
                case l.bracketOpen:
                        return l.errorf("unclosed left bracket")
                }
                l.emit(EOF)
                return nil
        case r == ',':
                l.emit(COMMA)
        case isSpace(r):
                return lexSpace
        case r == '*':
                l.emit(MUL)
        case r == '/':
                l.emit(DIV)
        case r == '%':
                l.emit(MOD)
        case r == '+':
                l.emit(ADD)
        case r == '-':
                l.emit(SUB)
        case r == '^':
                l.emit(POW)
        case r == '=':
                switch t := l.peek(); t {
                case '=':
                        l.next()
                        l.emit(EQLC)
                case '~':
                        // "=~" is only lexed inside braces (see lexInsideBraces).
                        return l.errorf("unexpected character after '=': %q", t)
                default:
                        l.emit(EQL)
                }
        case r == '!':
                if t := l.next(); t == '=' {
                        l.emit(NEQ)
                } else {
                        return l.errorf("unexpected character after '!': %q", t)
                }
        case r == '<':
                if t := l.peek(); t == '=' {
                        l.next()
                        l.emit(LTE)
                } else {
                        l.emit(LSS)
                }
        case r == '>':
                if t := l.peek(); t == '=' {
                        l.next()
                        l.emit(GTE)
                } else {
                        l.emit(GTR)
                }
        case isDigit(r) || (r == '.' && isDigit(l.peek())):
                l.backup()
                return lexNumberOrDuration
        case r == '"' || r == '\'':
                l.stringOpen = r
                return lexString
        case r == '`':
                l.stringOpen = r
                return lexRawString
        case isAlpha(r) || r == ':':
                if !l.bracketOpen {
                        l.backup()
                        return lexKeywordOrIdentifier
                }
                // Inside brackets only a literal ':' may become a COLON item.
                // Letters and underscores must not be emitted as COLON, which
                // would let a stray word pass as the colon of a subquery.
                if r != ':' {
                        return l.errorf("unexpected character inside brackets: %q", r)
                }
                // At most one ':' is allowed per bracket pair.
                if l.gotColon {
                        return l.errorf("unexpected colon %q", r)
                }
                l.emit(COLON)
                l.gotColon = true
        case r == '(':
                l.emit(LEFT_PAREN)
                l.parenDepth++
                return lexStatements
        case r == ')':
                // On a surplus ')' the RIGHT_PAREN item written here is
                // overwritten by the ERROR item from errorf below.
                l.emit(RIGHT_PAREN)
                l.parenDepth--
                if l.parenDepth < 0 {
                        return l.errorf("unexpected right parenthesis %q", r)
                }
                return lexStatements
        case r == '{':
                l.emit(LEFT_BRACE)
                l.braceOpen = true
                return lexInsideBraces
        case r == '[':
                if l.bracketOpen {
                        return l.errorf("unexpected left bracket %q", r)
                }
                l.gotColon = false
                l.emit(LEFT_BRACKET)
                // Whitespace directly after '[' is dropped so that lexDuration
                // starts on the first rune of the duration.
                if isSpace(l.peek()) {
                        skipSpaces(l)
                }
                l.bracketOpen = true
                return lexDuration
        case r == ']':
                if !l.bracketOpen {
                        return l.errorf("unexpected right bracket %q", r)
                }
                l.emit(RIGHT_BRACKET)
                l.bracketOpen = false
        case r == '@':
                l.emit(AT)
        default:
                return l.errorf("unexpected character: %q", r)
        }
        return lexStatements
}

// lexHistogram scans the inside of a histogram description ({{...}}) and the
// operator that may directly follow its closing "}}".
//
// When histogramState records a pending 'x', '+' or '-' (set after CLOSE_HIST
// was emitted), that rune is consumed and emitted as TIMES, ADD or SUB and the
// histogram state is reset. While a '[' is open, lexing is delegated to
// lexBuckets. Otherwise one rune is consumed and dispatched on; anything
// unexpected is reported through errorf, which ends the scan.
func lexHistogram(l *Lexer) stateFn {
        switch l.histogramState {
        case histogramStateMul:
                l.histogramState = histogramStateNone
                l.next()
                l.emit(TIMES)
                return lexNumber
        case histogramStateAdd:
                l.histogramState = histogramStateNone
                l.next()
                l.emit(ADD)
                return lexValueSequence
        case histogramStateSub:
                l.histogramState = histogramStateNone
                l.next()
                l.emit(SUB)
                return lexValueSequence
        }

        if l.bracketOpen {
                return lexBuckets
        }
        switch r := l.next(); {
        case isSpace(r):
                l.emit(SPACE)
                return lexSpace
        case isAlpha(r):
                // Matches every letter and '_', including 'x'. A repetition
                // marker after the closing "}}" is handled through
                // histogramStateMul above, so no separate 'x' case is needed.
                l.backup()
                return lexHistogramDescriptor
        case r == ':':
                l.emit(COLON)
                return lexHistogram
        case r == '-':
                l.emit(SUB)
                return lexHistogram
        case isDigit(r):
                l.backup()
                return lexNumber
        case r == '[':
                l.bracketOpen = true
                l.emit(LEFT_BRACKET)
                return lexBuckets
        case r == '}' && l.peek() == '}':
                l.next()
                l.emit(CLOSE_HIST)
                // Remember an operator that directly follows "}}" so that the
                // next call can emit it; otherwise leave histogram mode.
                switch l.peek() {
                case 'x':
                        l.histogramState = histogramStateMul
                        return lexHistogram
                case '+':
                        l.histogramState = histogramStateAdd
                        return lexHistogram
                case '-':
                        l.histogramState = histogramStateSub
                        return lexHistogram
                default:
                        l.histogramState = histogramStateNone
                        return lexValueSequence
                }
        default:
                return l.errorf("histogram description incomplete unexpected: %q", r)
        }
}

// lexHistogramDescriptor scans one word (letters and underscores) inside a
// histogram description. A known descriptor must be followed by ':' and is
// emitted with its descriptor type; "inf" and "nan" (in any case) are emitted
// as NUMBER. Anything else yields an ERROR item, after which lexStatements is
// returned as the next state.
func lexHistogramDescriptor(l *Lexer) stateFn {
        for isAlpha(l.next()) {
                // Absorb the word.
        }
        l.backup()

        word := l.input[l.start:l.pos]
        lowered := strings.ToLower(word)

        if desc, known := histogramDesc[lowered]; known {
                if l.peek() != ':' {
                        l.errorf("missing `:` for histogram descriptor")
                        return lexStatements
                }
                l.emit(desc)
                return lexHistogram
        }
        // Current word is Inf or NaN.
        if typ, known := key[lowered]; known && typ == NUMBER {
                l.emit(typ)
                return lexHistogram
        }
        l.errorf("bad histogram descriptor found: %q", word)
        return lexStatements
}

// lexBuckets scans the inside of a bracketed bucket list in a histogram
// description: spaces, numbers starting with a digit, and the closing ']'.
func lexBuckets(l *Lexer) stateFn {
        r := l.next()
        if isSpace(r) {
                l.emit(SPACE)
                return lexSpace
        }
        if isDigit(r) {
                l.backup()
                return lexNumber
        }
        if r == ']' {
                l.bracketOpen = false
                l.emit(RIGHT_BRACKET)
                return lexHistogram
        }
        return l.errorf("invalid character in buckets description: %q", r)
}

// lexInsideBraces scans the inside of a vector selector. Keywords are ignored and
// scanned as identifiers.
//
// It handles comments, whitespace, identifiers, commas, quoted and raw
// strings, and the matchers =, =~, != and !~. A '}' closes the braces and
// continues with lexValueSequence when lexing a series description, otherwise
// with lexStatements. End of input or any other rune inside the braces is an
// error.
func lexInsideBraces(l *Lexer) stateFn {
        if strings.HasPrefix(l.input[l.pos:], lineComment) {
                return lexLineComment
        }

        switch r := l.next(); {
        case r == eof:
                return l.errorf("unexpected end of input inside braces")
        case isSpace(r):
                // lexSpace returns lexStatements, which dispatches back here
                // because braceOpen is still set.
                return lexSpace
        case isAlpha(r):
                l.backup()
                return lexIdentifier
        case r == ',':
                l.emit(COMMA)
        case r == '"' || r == '\'':
                l.stringOpen = r
                return lexString
        case r == '`':
                l.stringOpen = r
                return lexRawString
        case r == '=':
                // "=~" is a regex matcher; a lone '=' is an equality matcher.
                if l.next() == '~' {
                        l.emit(EQL_REGEX)
                        break
                }
                l.backup()
                l.emit(EQL)
        case r == '!':
                switch nr := l.next(); {
                case nr == '~':
                        l.emit(NEQ_REGEX)
                case nr == '=':
                        l.emit(NEQ)
                default:
                        return l.errorf("unexpected character after '!' inside braces: %q", nr)
                }
        case r == '{':
                // Braces do not nest.
                return l.errorf("unexpected left brace %q", r)
        case r == '}':
                l.emit(RIGHT_BRACE)
                l.braceOpen = false

                if l.seriesDesc {
                        return lexValueSequence
                }
                return lexStatements
        default:
                return l.errorf("unexpected character inside braces: %q", r)
        }
        return lexInsideBraces
}

// lexValueSequence scans a value sequence of a series description.
//
// It delegates to lexHistogram while a histogram description is in progress.
// Otherwise it emits SPACE, ADD, SUB, TIMES and BLANK items, numbers, and the
// opening "{{" of a histogram description; words starting with a letter are
// passed to lexKeywordOrIdentifier.
func lexValueSequence(l *Lexer) stateFn {
        if l.histogramState != histogramStateNone {
                return lexHistogram
        }
        switch r := l.next(); {
        case r == eof:
                return lexStatements
        case r == '{' && l.peek() == '{':
                // NOTE(review): this check looks unreachable, as the guard at
                // the top of the function already returned for any state other
                // than histogramStateNone.
                if l.histogramState != histogramStateNone {
                        return l.errorf("unexpected histogram opening {{")
                }
                l.histogramState = histogramStateOpen
                l.next()
                l.emit(OPEN_HIST)
                return lexHistogram
        case isSpace(r):
                l.emit(SPACE)
                // Called directly to drop the remaining spaces; its returned
                // state is intentionally not used.
                lexSpace(l)
        case r == '+':
                l.emit(ADD)
        case r == '-':
                l.emit(SUB)
        case r == 'x':
                // Must come before the isAlpha case below, which also matches 'x'.
                l.emit(TIMES)
        case r == '_':
                // Likewise matched before isAlpha, which also accepts '_'.
                l.emit(BLANK)
        case isDigit(r) || (r == '.' && isDigit(l.peek())):
                l.backup()
                // Called directly so that lexing stays in this state. The
                // returned state is discarded, so after a bad number the ERROR
                // item is produced but the scan is not ended here.
                lexNumber(l)
        case isAlpha(r):
                l.backup()
                // We might lex invalid Items here but this will be caught by the parser.
                return lexKeywordOrIdentifier
        default:
                return l.errorf("unexpected character in series sequence: %q", r)
        }
        return lexValueSequence
}

// lexEscape scans a string escape sequence. The initial escaping character (\)
// has already been seen.
//
// Errors are reported by writing an ERROR item through errorf, but the state
// returned is always lexString, so the scan itself is not ended here.
//
// NOTE: This function as well as the helper function digitVal() and associated
// tests have been adapted from the corresponding functions in the "go/scanner"
// package of the Go standard library to work for Prometheus-style strings.
// None of the actual escaping/quoting logic was changed in this function - it
// was only modified to integrate with our lexer.
func lexEscape(l *Lexer) stateFn {
        // n is the number of digits still expected, base their radix and max
        // the largest value the escape may encode.
        var n int
        var base, max uint32

        ch := l.next()
        switch ch {
        case 'a', 'b', 'f', 'n', 'r', 't', 'v', '\\', l.stringOpen:
                // Single-character escapes need no further validation.
                return lexString
        case '0', '1', '2', '3', '4', '5', '6', '7':
                // Octal: ch already holds the first of three digits.
                n, base, max = 3, 8, 255
        case 'x':
                ch = l.next()
                n, base, max = 2, 16, 255
        case 'u':
                ch = l.next()
                n, base, max = 4, 16, unicode.MaxRune
        case 'U':
                ch = l.next()
                n, base, max = 8, 16, unicode.MaxRune
        case eof:
                l.errorf("escape sequence not terminated")
                return lexString
        default:
                l.errorf("unknown escape sequence %#U", ch)
                return lexString
        }

        // Accumulate the numeric value of the n expected digits.
        var x uint32
        for n > 0 {
                d := uint32(digitVal(ch))
                if d >= base {
                        if ch == eof {
                                l.errorf("escape sequence not terminated")
                                return lexString
                        }
                        l.errorf("illegal character %#U in escape sequence", ch)
                        return lexString
                }
                x = x*base + d
                n--

                // Don't seek after last rune.
                if n > 0 {
                        ch = l.next()
                }
        }

        // Reject values above max and the surrogate range 0xD800-0xDFFF.
        if x > max || 0xD800 <= x && x < 0xE000 {
                l.errorf("escape sequence is an invalid Unicode code point")
        }
        return lexString
}

// digitVal returns the numeric value of ch interpreted as a decimal or
// hexadecimal digit (either case), or 16 if ch is not such a digit.
func digitVal(ch rune) int {
        if ch >= '0' && ch <= '9' {
                return int(ch - '0')
        }
        if ch >= 'a' && ch <= 'f' {
                return int(ch - 'a' + 10)
        }
        if ch >= 'A' && ch <= 'F' {
                return int(ch - 'A' + 10)
        }
        // Larger than any legal digit val.
        return 16
}

// skipSpaces consumes all space runes in front of the next non-space rune and
// drops them from the pending input.
func skipSpaces(l *Lexer) {
        for {
                if !isSpace(l.peek()) {
                        break
                }
                l.next()
        }
        l.ignore()
}

// lexString scans a quoted string. The initial quote has already been seen.
// A backslash hands over to lexEscape; end of input or a newline before the
// closing quote is an error that ends the scan.
func lexString(l *Lexer) stateFn {
        for {
                r := l.next()
                if r == '\\' {
                        return lexEscape
                }
                if r == utf8.RuneError {
                        l.errorf("invalid UTF-8 rune")
                        return lexString
                }
                if r == eof || r == '\n' {
                        return l.errorf("unterminated quoted string")
                }
                if r == l.stringOpen {
                        l.emit(STRING)
                        return lexStatements
                }
        }
}

// lexRawString scans a raw quoted string. The initial quote has already been
// seen. Errors produce an ERROR item but keep lexRawString as the next state.
func lexRawString(l *Lexer) stateFn {
        for {
                r := l.next()
                if r == utf8.RuneError {
                        l.errorf("invalid UTF-8 rune")
                        return lexRawString
                }
                if r == eof {
                        l.errorf("unterminated raw string")
                        return lexRawString
                }
                if r == l.stringOpen {
                        l.emit(STRING)
                        return lexStatements
                }
        }
}

// lexSpace scans a run of space characters and drops it from the pending
// input. One space has already been seen.
func lexSpace(l *Lexer) stateFn {
        skipSpaces(l)
        return lexStatements
}

// lexLineComment scans a line comment. Left comment marker is known to be
// present. The emitted COMMENT item ends in front of the end-of-line character
// or at the end of the input.
func lexLineComment(l *Lexer) stateFn {
        l.pos += posrange.Pos(len(lineComment))
        for {
                r := l.next()
                if r == eof || isEndOfLine(r) {
                        break
                }
        }
        l.backup()
        l.emit(COMMENT)
        return lexStatements
}

// lexDuration scans a duration Item. Input that scans as a complete number has
// no unit and is rejected, as is anything acceptRemainingDuration does not
// accept.
func lexDuration(l *Lexer) stateFn {
        switch {
        case l.scanNumber():
                return l.errorf("missing unit character in duration")
        case !acceptRemainingDuration(l):
                return l.errorf("bad duration syntax: %q", l.input[l.start:l.pos])
        }
        // acceptRemainingDuration read one rune past the duration.
        l.backup()
        l.emit(DURATION)
        return lexStatements
}

// lexNumber scans a number: decimal, hex, oct or float. Input that scanNumber
// rejects is reported as an error, which ends the scan.
func lexNumber(l *Lexer) stateFn {
        if l.scanNumber() {
                l.emit(NUMBER)
                return lexStatements
        }
        return l.errorf("bad number syntax: %q", l.input[l.start:l.pos])
}

// lexNumberOrDuration scans a number or a duration Item. A number is tried
// first; if that fails, the input consumed so far must continue as a duration.
func lexNumberOrDuration(l *Lexer) stateFn {
        if l.scanNumber() {
                l.emit(NUMBER)
                return lexStatements
        }
        // Next two chars must be a valid unit and a non-alphanumeric.
        if !acceptRemainingDuration(l) {
                return l.errorf("bad number or duration syntax: %q", l.input[l.start:l.pos])
        }
        // acceptRemainingDuration read one rune past the duration.
        l.backup()
        l.emit(DURATION)
        return lexStatements
}

// acceptRemainingDuration consumes the rest of a duration whose leading number
// has already been scanned: a unit, then any number of further number+unit
// groups. It reports whether the rune after the duration is non-alphanumeric.
// That rune is consumed too, so callers back up once on success.
func acceptRemainingDuration(l *Lexer) bool {
        const digits = "0123456789"

        // Next two char must be a valid duration.
        if !l.accept("smhdwy") {
                return false
        }
        // Support for ms. Bad units like hs, ys will be caught when we actually
        // parse the duration.
        l.accept("s")

        // Next char can be another number then a unit.
        for l.accept(digits) {
                l.acceptRun(digits)
                // y is no longer in the list as it should always come first in
                // durations.
                if !l.accept("smhdw") {
                        return false
                }
                // Support for ms, as above.
                l.accept("s")
        }

        following := l.next()
        return !isAlphaNumeric(following)
}

// scanNumber scans numbers of different formats. The scanned Item is
// not necessarily a valid number. This case is caught by the parser.
//
// It returns true when the consumed text has number shape and is not directly
// followed by an alphanumeric rune (in series descriptions a following 'x' is
// allowed). On false the runes consumed so far stay consumed; callers use them
// for error messages or continue scanning them as a duration.
func (l *Lexer) scanNumber() bool {
        // Modify the digit pattern if the number is hexadecimal.
        digitPattern := "0123456789"
        // Disallow hexadecimal in series descriptions as the syntax is ambiguous.
        if !l.seriesDesc &&
                l.accept("0") && l.accept("xX") {
                l.accept("_") // eg., 0X_1FFFP-16 == 0.1249847412109375
                digitPattern = "0123456789abcdefABCDEF"
        }
        const (
                // Define dot, exponent, and underscore patterns.
                dotPattern        = "."
                exponentPattern   = "eE"
                underscorePattern = "_"
                // Anti-patterns are rune sets that cannot follow their respective rune.
                dotAntiPattern        = "_."
                exponentAntiPattern   = "._eE" // and EOL.
                underscoreAntiPattern = "._eE" // and EOL.
        )
        // All numbers follow the prefix: [.][d][d._eE]*
        l.accept(dotPattern)
        l.accept(digitPattern)
        // [d._eE]* hereon.
        // Note: a dot consumed by the prefix above is not recorded in dotConsumed.
        dotConsumed := false
        exponentConsumed := false
        for l.is(digitPattern + dotPattern + underscorePattern + exponentPattern) {
                // "." cannot repeat.
                if l.is(dotPattern) {
                        if dotConsumed {
                                // Consume the offending rune so it is part of
                                // the text the caller reports.
                                l.accept(dotPattern)
                                return false
                        }
                }
                // "eE" cannot repeat.
                if l.is(exponentPattern) {
                        if exponentConsumed {
                                l.accept(exponentPattern)
                                return false
                        }
                }
                // Handle dots.
                if l.accept(dotPattern) {
                        dotConsumed = true
                        if l.accept(dotAntiPattern) {
                                return false
                        }
                        // Fractional hexadecimal literals are not allowed.
                        // (A digit pattern longer than 10 means hex mode.)
                        if len(digitPattern) > 10 /* 0x[\da-fA-F].[\d]+p[\d] */ {
                                return false
                        }
                        continue
                }
                // Handle exponents.
                if l.accept(exponentPattern) {
                        exponentConsumed = true
                        // Optional sign of the exponent.
                        l.accept("+-")
                        if l.accept(exponentAntiPattern) || l.peek() == eof {
                                return false
                        }
                        continue
                }
                // Handle underscores.
                if l.accept(underscorePattern) {
                        if l.accept(underscoreAntiPattern) || l.peek() == eof {
                                return false
                        }

                        continue
                }
                // Handle digits at the end since we already consumed before this loop.
                l.acceptRun(digitPattern)
        }

        // Next thing must not be alphanumeric unless it's the times token
        // for series repetitions.
        if r := l.peek(); (l.seriesDesc && r == 'x') || !isAlphaNumeric(r) {
                return true
        }
        return false
}

// lexIdentifier scans an alphanumeric identifier and emits it as IDENTIFIER.
// The next character is known to be a letter.
func lexIdentifier(l *Lexer) stateFn {
        for {
                if !isAlphaNumeric(l.next()) {
                        break
                }
        }
        // Give back the rune that ended the identifier.
        l.backup()
        l.emit(IDENTIFIER)
        return lexStatements
}

// lexKeywordOrIdentifier scans a word made of alphanumeric runes and colons.
// A word that matches an entry of key (compared in lower case) is emitted with
// that entry's Item type; otherwise it is a METRIC_IDENTIFIER if it contains a
// colon and an IDENTIFIER if it does not. In a series description lexing
// continues with lexValueSequence unless a '{' follows.
func lexKeywordOrIdentifier(l *Lexer) stateFn {
        for r := l.next(); isAlphaNumeric(r) || r == ':'; r = l.next() {
                // Absorb the word.
        }
        l.backup()

        word := l.input[l.start:l.pos]
        if kw, isKeyword := key[strings.ToLower(word)]; isKeyword {
                l.emit(kw)
        } else if strings.Contains(word, ":") {
                l.emit(METRIC_IDENTIFIER)
        } else {
                l.emit(IDENTIFIER)
        }

        if l.seriesDesc && l.peek() != '{' {
                return lexValueSequence
        }
        return lexStatements
}

// isSpace reports whether r is a space, tab, newline or carriage return.
func isSpace(r rune) bool {
        switch r {
        case ' ', '\t', '\n', '\r':
                return true
        }
        return false
}

// isEndOfLine reports whether r is a carriage return or a newline.
func isEndOfLine(r rune) bool {
        switch r {
        case '\n', '\r':
                return true
        default:
                return false
        }
}

// isAlphaNumeric reports whether r is an alphabetic, digit, or underscore.
func isAlphaNumeric(r rune) bool {
        return isAlpha(r) || isDigit(r)
}

// isDigit reports whether r is one of the ASCII digits 0-9. unicode.IsDigit()
// cannot be used instead because it also classifies non-Latin digits as
// digits. See https://github.com/prometheus/prometheus/issues/939.
func isDigit(r rune) bool {
        return r >= '0' && r <= '9'
}

// isAlpha reports whether r is an ASCII letter (either case) or an underscore.
func isAlpha(r rune) bool {
        switch {
        case r == '_':
                return true
        case r >= 'a' && r <= 'z':
                return true
        case r >= 'A' && r <= 'Z':
                return true
        default:
                return false
        }
}

// isLabel reports whether the string can be used as label: it must be
// non-empty, start with a letter or underscore, and continue with letters,
// digits or underscores only.
func isLabel(s string) bool {
        if s == "" {
                return false
        }
        if !isAlpha(rune(s[0])) {
                return false
        }
        for _, c := range s[1:] {
                if isAlphaNumeric(c) {
                        continue
                }
                return false
        }
        return true
}

// Copyright 2015 The Prometheus Authors
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package parser

import (
        "errors"
        "fmt"
        "math"
        "os"
        "runtime"
        "strconv"
        "strings"
        "sync"
        "time"

        "github.com/prometheus/common/model"

        "github.com/prometheus/prometheus/model/histogram"
        "github.com/prometheus/prometheus/model/labels"
        "github.com/prometheus/prometheus/model/timestamp"
        "github.com/prometheus/prometheus/promql/parser/posrange"
        "github.com/prometheus/prometheus/util/strutil"
)

// parserPool holds parser instances for reuse: NewParser takes a parser from
// the pool and Close puts it back.
var parserPool = sync.Pool{
        New: func() interface{} {
                return &parser{}
        },
}

// Parser is the interface of a PromQL expression parser.
type Parser interface {
        // ParseExpr parses the input into an expression.
        ParseExpr() (Expr, error)
        // Close releases the parser once it is no longer needed.
        Close()
}

// parser holds the state used while parsing one input string. Instances are
// recycled through parserPool.
type parser struct {
        // lex is the lexer that produces the items handed out by Lex.
        lex Lexer

        // inject is the item type queued by InjectItem; injecting reports
        // whether it is still waiting to be returned by the next call to Lex.
        inject    ItemType
        injecting bool

        // functions contains all functions supported by the parser instance.
        functions map[string]*Function

        // Every time an Item is lexed that could be the end
        // of certain expressions its end position is stored here.
        lastClosing posrange.Pos

        // yyParser is the yacc generated parser that parseGenerated runs.
        yyParser yyParserImpl

        // generatedParserResult is the value parseGenerated returns once the
        // generated parser has run; parseErrors collects the errors recorded
        // through addParseErr.
        generatedParserResult interface{}
        parseErrors           ParseErrors
}

// Opt is an option that NewParser applies to the parser it is about to return.
type Opt func(p *parser)

// WithFunctions returns an Opt that makes the parser use the given function
// table instead of the one NewParser installs by default.
func WithFunctions(functions map[string]*Function) Opt {
        setFunctions := func(target *parser) {
                target.functions = functions
        }
        return setFunctions
}

// NewParser returns a parser for input, taken from parserPool. The default
// function table is Functions; opts are applied last, in order, and may
// override it (see WithFunctions). Callers hand the parser back with Close.
func NewParser(input string, opts ...Opt) *parser { //nolint:revive // unexported-return.
        p := parserPool.Get().(*parser)

        // The parser may have been used for an earlier input, so reset every
        // piece of per-parse state; otherwise values such as lastClosing would
        // leak from the previous parse into this one.
        p.functions = Functions
        p.inject = 0
        p.injecting = false
        p.lastClosing = 0
        p.parseErrors = nil
        p.generatedParserResult = nil

        // Clear lexer struct before reusing.
        p.lex = Lexer{
                input: input,
                state: lexStatements,
        }

        // Apply user defined options.
        for _, opt := range opts {
                opt(p)
        }

        return p
}

// ParseExpr runs the generated parser with the expression start symbol and
// returns the resulting expression. When no syntax error was recorded the
// AST is additionally checked with checkAST. Any recorded errors are returned
// as ParseErrors; panics raised while parsing are turned into err by recover.
func (p *parser) ParseExpr() (expr Expr, err error) {
        defer p.recover(&err)

        if res := p.parseGenerated(START_EXPRESSION); res != nil {
                expr = res.(Expr)
        }

        // Only typecheck when there are no syntax errors.
        if len(p.parseErrors) == 0 {
                p.checkAST(expr)
        }

        if len(p.parseErrors) == 0 {
                return expr, err
        }
        return expr, p.parseErrors
}

// Close puts the parser back into parserPool, from where NewParser may hand
// it out again.
func (p *parser) Close() {
        parserPool.Put(p)
}

// ParseErr wraps a parsing error with line and position context.
type ParseErr struct {
        // PositionRange is the position range the error is reported for, Err
        // is the underlying error, and Query is the input being parsed
        // (addParseErr stores the lexer input here).
        PositionRange posrange.PositionRange
        Err           error
        Query         string

        // LineOffset is an additional line offset to be added. Only used inside unit tests.
        LineOffset int
}

// Error formats the error as "<start position>: parse error: <underlying error>",
// where the start position is derived from Query and LineOffset.
func (e *ParseErr) Error() string {
        pos := e.PositionRange.StartPosInput(e.Query, e.LineOffset)
        return fmt.Sprintf("%s: parse error: %s", pos, e.Err)
}

// ParseErrors is a list of parse errors that can itself be used as an error.
type ParseErrors []ParseErr

// Error returns the message of the first error in the list only, since
// producing multiple error messages might look weird when combined with error
// wrapping. If getting the full error list is desired, it is recommended to
// typecast the error returned by the parser to ParseErrors and work with the
// underlying slice.
func (errs ParseErrors) Error() string {
        if len(errs) == 0 {
                // Should never happen.
                // Panicking while printing an error seems like a bad idea, so
                // the situation is explained in the error message instead.
                return "error contains no error message"
        }
        return errs[0].Error()
}

// EnrichParseError enriches a single or list of parse errors (used for unit
// tests and promtool). enrich is called with a pointer to each ParseErr found
// in err; for a ParseErrors list the modified entries are written back into
// the list.
func EnrichParseError(err error, enrich func(parseErr *ParseErr)) {
        var single *ParseErr
        if errors.As(err, &single) {
                enrich(single)
        }

        var list ParseErrors
        if !errors.As(err, &list) {
                return
        }
        for i := range list {
                // Work on a copy and store it back afterwards.
                entry := list[i]
                enrich(&entry)
                list[i] = entry
        }
}

// ParseExpr returns the expression parsed from the input. It uses a parser
// obtained from NewParser and closes it before returning.
func ParseExpr(input string) (expr Expr, err error) {
        p := NewParser(input)
        defer p.Close()

        expr, err = p.ParseExpr()
        return expr, err
}

// ParseMetric parses the input into a metric, i.e. a set of labels. Recorded
// parse errors are returned as ParseErrors.
func ParseMetric(input string) (m labels.Labels, err error) {
        p := NewParser(input)
        defer p.Close()
        // Deferred last so that it runs first and can turn a panic into err
        // before the parser is closed.
        defer p.recover(&err)

        if res := p.parseGenerated(START_METRIC); res != nil {
                m = res.(labels.Labels)
        }
        if len(p.parseErrors) > 0 {
                err = p.parseErrors
        }
        return m, err
}

// ParseMetricSelector parses the provided textual metric selector into a list
// of label matchers. Recorded parse errors are returned as ParseErrors.
func ParseMetricSelector(input string) (m []*labels.Matcher, err error) {
        p := NewParser(input)
        defer p.Close()
        // Deferred last so that it runs first and can turn a panic into err
        // before the parser is closed.
        defer p.recover(&err)

        if res := p.parseGenerated(START_METRIC_SELECTOR); res != nil {
                selector := res.(*VectorSelector)
                m = selector.LabelMatchers
        }
        if len(p.parseErrors) > 0 {
                err = p.parseErrors
        }
        return m, err
}

// ParseMetricSelectors parses a list of provided textual metric selectors into
// lists of label matchers, one list per selector and in input order. It stops
// at the first selector that fails to parse and returns that error.
func ParseMetricSelectors(matchers []string) (m [][]*labels.Matcher, err error) {
        var sets [][]*labels.Matcher
        for i := range matchers {
                set, parseErr := ParseMetricSelector(matchers[i])
                if parseErr != nil {
                        return nil, parseErr
                }
                sets = append(sets, set)
        }
        return sets, nil
}

// SequenceValue is an omittable value in a sequence of time series values.
type SequenceValue struct {
        Value     float64
        Omitted   bool
        Histogram *histogram.FloatHistogram
}

// String renders the value: "_" when it is omitted, the histogram's own
// string form when a histogram is set, and the float value formatted with
// "%f" otherwise.
func (v SequenceValue) String() string {
        switch {
        case v.Omitted:
                return "_"
        case v.Histogram != nil:
                return v.Histogram.String()
        default:
                return fmt.Sprintf("%f", v.Value)
        }
}

// seriesDescription is the result type ParseSeriesDesc expects from the
// generated parser: the labels of a series together with its sequence of
// values.
type seriesDescription struct {
        labels labels.Labels
        values []SequenceValue
}

// ParseSeriesDesc parses the description of a time series and returns its
// labels and values. The lexer is switched into series description mode
// before parsing. Recorded parse errors are returned as ParseErrors.
func ParseSeriesDesc(input string) (labels labels.Labels, values []SequenceValue, err error) {
        p := NewParser(input)
        p.lex.seriesDesc = true

        defer p.Close()
        // Deferred last so that it runs first and can turn a panic into err
        // before the parser is closed.
        defer p.recover(&err)

        if res := p.parseGenerated(START_SERIES_DESCRIPTION); res != nil {
                desc := res.(*seriesDescription)
                labels, values = desc.labels, desc.values
        }
        if len(p.parseErrors) > 0 {
                err = p.parseErrors
        }
        return labels, values, err
}

// addParseErrf builds an error from format and args with fmt.Errorf and
// appends it to the list of parsing errors for the given position range.
func (p *parser) addParseErrf(positionRange posrange.PositionRange, format string, args ...interface{}) {
        err := fmt.Errorf(format, args...)
        p.addParseErr(positionRange, err)
}

// addParseErr appends the provided error to the list of parsing errors,
// wrapped in a ParseErr that carries the position range and the lexer input.
func (p *parser) addParseErr(positionRange posrange.PositionRange, err error) {
        p.parseErrors = append(p.parseErrors, ParseErr{
                PositionRange: positionRange,
                Err:           err,
                Query:         p.lex.input,
        })
}

// addSemanticError records err as a parse error located at the position range
// of the item currently held in the generated parser's lval.
func (p *parser) addSemanticError(err error) {
        pos := p.yyParser.lval.item.PositionRange()
        p.addParseErr(pos, err)
}

// unexpected creates a parser error complaining about an unexpected lexer item.
// The item that is presented as unexpected is always the last item produced
// by the lexer. context and expected are optional; when non-empty they are
// appended to the message as " in <context>" and ", expected <expected>".
func (p *parser) unexpected(context, expected string) {
        item := p.yyParser.lval.item

        // Do not report lexer errors twice.
        if item.Typ == ERROR {
                return
        }

        var msg strings.Builder
        msg.WriteString("unexpected ")
        msg.WriteString(item.desc())
        if context != "" {
                msg.WriteString(" in " + context)
        }
        if expected != "" {
                msg.WriteString(", expected " + expected)
        }

        p.addParseErr(item.PositionRange(), errors.New(msg.String()))
}

// errUnexpected is the error reported when parsing panicked with a
// runtime.Error.
var errUnexpected = errors.New("unexpected error")

// recover is the handler that turns panics into returns from the top level of
// Parse. It must be deferred directly (defer p.recover(&err)) so that the
// builtin recover is able to stop the panic.
//
//   - no panic: *errp is left untouched;
//   - runtime.Error: the stack trace is printed to stderr and *errp is set to
//     errUnexpected;
//   - any other error value: *errp is set to that error;
//   - any non-error value (for example the string panics raised by
//     InjectItem): *errp is set to an error carrying the value's text.
func (p *parser) recover(errp *error) {
        e := recover()
        if e == nil {
                return
        }

        switch v := e.(type) {
        case runtime.Error:
                // Print the stack trace but do not inhibit the running application.
                buf := make([]byte, 64<<10)
                buf = buf[:runtime.Stack(buf, false)]

                fmt.Fprintf(os.Stderr, "parser panic: %v\n%s", e, buf)
                *errp = errUnexpected
        case error:
                *errp = v
        default:
                // A bare type assertion to error would panic again right here,
                // inside the recovery handler, so convert the value instead.
                *errp = fmt.Errorf("%v", v)
        }
}

// Lex is expected by the yyLexer interface of the yacc generated parser.
// It writes the next Item provided by the lexer to the provided pointer address
// and returns its type. Comments are skipped. A pending injected item (see
// InjectItem) is returned first, without consulting the lexer.
//
// The yyLexer interface is currently implemented by the parser to allow
// the generated and non-generated parts to work together with regards to lookahead
// and error handling.
//
// For more information, see https://pkg.go.dev/golang.org/x/tools/cmd/goyacc.
func (p *parser) Lex(lval *yySymType) int {
        if p.injecting {
                p.injecting = false
                return int(p.inject)
        }

        // Skip comments.
        p.lex.NextItem(&lval.item)
        for lval.item.Typ == COMMENT {
                p.lex.NextItem(&lval.item)
        }
        typ := lval.item.Typ

        switch typ {
        case ERROR:
                // The error spans from the start of the failing item to the end
                // of the input.
                errRange := posrange.PositionRange{
                        Start: p.lex.start,
                        End:   posrange.Pos(len(p.lex.input)),
                }
                p.addParseErr(errRange, errors.New(p.yyParser.lval.item.Val))

                // Tells yacc that this is the end of input.
                return 0
        case EOF:
                lval.item.Typ = EOF
                p.InjectItem(0)
        case RIGHT_BRACE, RIGHT_PAREN, RIGHT_BRACKET, DURATION, NUMBER:
                // Remember where this potentially closing item ends.
                p.lastClosing = lval.item.Pos + posrange.Pos(len(lval.item.Val))
        }

        return int(typ)
}

// Error is expected by the yyLexer interface of the yacc generated parser.
//
// It is a no-op since the parser's error routines are triggered
// by mechanisms that allow more fine-grained control.
// For more information, see https://pkg.go.dev/golang.org/x/tools/cmd/goyacc.
func (p *parser) Error(string) {
}

// InjectItem allows injecting a single Item at the beginning of the token stream
// consumed by the generated parser.
// This allows having multiple start symbols as described in
// https://www.gnu.org/software/bison/manual/html_node/Multiple-start_002dsymbols.html .
// Only the Lex function used by the generated parser is affected by this injected Item.
// Trying to inject when a previously injected Item has not yet been consumed will panic.
// Only Item types that are supposed to be used as start symbols are allowed as an argument;
// the value 0 is accepted as well (Lex injects it on EOF).
func (p *parser) InjectItem(typ ItemType) {
        switch {
        case p.injecting:
                panic("cannot inject multiple Items into the token stream")
        case typ != 0 && !(typ > startSymbolsStart && typ < startSymbolsEnd):
                panic("cannot inject symbol that isn't start symbol")
        }

        p.inject, p.injecting = typ, true
}

// newBinaryExpression completes the BinaryExpr passed in as modifiers by
// filling in both operands and the operator type, and returns it. It panics
// if modifiers is not a *BinaryExpr or if an operand is not an Expr.
func (p *parser) newBinaryExpression(lhs Node, op Item, modifiers, rhs Node) *BinaryExpr {
        expr := modifiers.(*BinaryExpr)

        // Assigned one by one so that a failing assertion leaves the same
        // partial state behind as before.
        expr.LHS = lhs.(Expr)
        expr.RHS = rhs.(Expr)
        expr.Op = op.Typ
        return expr
}

// assembleVectorSelector appends an equality matcher on the metric name label
// to vs.LabelMatchers when the selector carries a name in vs.Name.
func (p *parser) assembleVectorSelector(vs *VectorSelector) {
        // If the metric name was inside the braces we don't need to do anything.
        if vs.Name == "" {
                return
        }

        // The metric name was set outside the braces, add a matcher for it.
        nameMatcher, err := labels.NewMatcher(labels.MatchEqual, labels.MetricName, vs.Name)
        if err != nil {
                panic(err) // Must not happen with labels.MatchEqual
        }
        vs.LabelMatchers = append(vs.LabelMatchers, nameMatcher)
}

// newAggregateExpr completes the AggregateExpr passed in as modifier: it sets
// the position range (from the operator to the last closing item), the
// operator, and distributes args over Param and Expr. Aggregators with a
// parameter take two arguments (parameter first), all others take one. A wrong
// argument count is recorded as a parse error and the expression is returned
// without Expr set.
func (p *parser) newAggregateExpr(op Item, modifier, args Node) (ret *AggregateExpr) {
        ret = modifier.(*AggregateExpr)
        arguments := args.(Expressions)

        ret.PosRange = posrange.PositionRange{Start: op.Pos, End: p.lastClosing}
        ret.Op = op.Typ

        if len(arguments) == 0 {
                p.addParseErrf(ret.PositionRange(), "no arguments for aggregate expression provided")

                // Prevents invalid array accesses.
                return ret
        }

        wantArgs := 1
        if ret.Op.IsAggregatorWithParam() {
                wantArgs = 2
                ret.Param = arguments[0]
        }

        if got := len(arguments); got != wantArgs {
                p.addParseErrf(ret.PositionRange(), "wrong number of arguments for aggregate expression provided, expected %d, got %d", wantArgs, got)
                return ret
        }

        // The aggregated expression is always the last argument.
        ret.Expr = arguments[wantArgs-1]
        return ret
}

// newMap is used when building the FloatHistogram from a map. It returns a
// new, empty map.
func (p *parser) newMap() (ret map[string]interface{}) {
        ret = make(map[string]interface{})
        return ret
}

// mergeMaps is used to combine maps as they're used to later build the Float histogram.
// This will merge the right map into the left map and return left. A key that
// already exists in left is not overwritten; a parse error is recorded for it
// instead.
func (p *parser) mergeMaps(left, right *map[string]interface{}) (ret *map[string]interface{}) {
        for key, value := range *right {
                if _, exists := (*left)[key]; !exists {
                        (*left)[key] = value
                        continue
                }
                p.addParseErrf(posrange.PositionRange{}, "duplicate key \"%s\" in histogram", key)
        }
        return left
}

// histogramsIncreaseSeries builds a series via histogramsSeries in which inc
// is added to the previous histogram at every step.
func (p *parser) histogramsIncreaseSeries(base, inc *histogram.FloatHistogram, times uint64) ([]SequenceValue, error) {
        add := func(a, b *histogram.FloatHistogram) (*histogram.FloatHistogram, error) {
                return a.Add(b)
        }
        return p.histogramsSeries(base, inc, times, add)
}

// histogramsDecreaseSeries builds a series via histogramsSeries in which inc
// is subtracted from the previous histogram at every step.
func (p *parser) histogramsDecreaseSeries(base, inc *histogram.FloatHistogram, times uint64) ([]SequenceValue, error) {
        sub := func(a, b *histogram.FloatHistogram) (*histogram.FloatHistogram, error) {
                return a.Sub(b)
        }
        return p.histogramsSeries(base, inc, times, sub)
}

// histogramsSeries returns times+1 sequence values: base itself followed by
// the results of repeatedly applying combine to a copy of the previous
// histogram and inc. It fails when the current histogram's schema is greater
// than inc's schema, or when combine returns an error; in the latter case the
// values built so far are returned alongside the error.
func (p *parser) histogramsSeries(base, inc *histogram.FloatHistogram, times uint64,
        combine func(*histogram.FloatHistogram, *histogram.FloatHistogram) (*histogram.FloatHistogram, error),
) ([]SequenceValue, error) {
        series := make([]SequenceValue, times+1)
        // Add an additional value (the base) for time 0, which we ignore in tests.
        series[0] = SequenceValue{Histogram: base}

        prev := base
        for step := uint64(0); step < times; step++ {
                if prev.Schema > inc.Schema {
                        return nil, fmt.Errorf("error combining histograms: cannot merge from schema %d to %d", inc.Schema, prev.Schema)
                }

                next, err := combine(prev.Copy(), inc)
                if err != nil {
                        return series, err
                }
                series[step+1] = SequenceValue{Histogram: next}
                prev = next
        }

        return series, nil
}

// buildHistogramFromMap is used in the grammar to take the individual parts of
// the histogram and complete it. Each known key of desc is copied into the
// matching FloatHistogram field when its value has the expected Go type
// (int64 for "schema", float64 for "sum", "count", "z_bucket" and
// "z_bucket_w", []float64 for "custom_values"); otherwise a parse error is
// recorded and the field keeps its zero value. Positive and negative buckets
// and spans come from buildHistogramBucketsAndSpans.
func (p *parser) buildHistogramFromMap(desc *map[string]interface{}) *histogram.FloatHistogram {
        output := &histogram.FloatHistogram{}
        fields := *desc

        if raw, present := fields["schema"]; present {
                if schema, isInt := raw.(int64); !isInt {
                        p.addParseErrf(p.yyParser.lval.item.PositionRange(), "error parsing schema number: %v", raw)
                } else {
                        output.Schema = int32(schema)
                }
        }

        if raw, present := fields["sum"]; present {
                if sum, isFloat := raw.(float64); !isFloat {
                        p.addParseErrf(p.yyParser.lval.item.PositionRange(), "error parsing sum number: %v", raw)
                } else {
                        output.Sum = sum
                }
        }

        if raw, present := fields["count"]; present {
                if count, isFloat := raw.(float64); !isFloat {
                        p.addParseErrf(p.yyParser.lval.item.PositionRange(), "error parsing count number: %v", raw)
                } else {
                        output.Count = count
                }
        }

        if raw, present := fields["z_bucket"]; present {
                if zeroCount, isFloat := raw.(float64); !isFloat {
                        p.addParseErrf(p.yyParser.lval.item.PositionRange(), "error parsing z_bucket number: %v", raw)
                } else {
                        output.ZeroCount = zeroCount
                }
        }

        if raw, present := fields["z_bucket_w"]; present {
                if zeroThreshold, isFloat := raw.(float64); !isFloat {
                        p.addParseErrf(p.yyParser.lval.item.PositionRange(), "error parsing z_bucket_w number: %v", raw)
                } else {
                        output.ZeroThreshold = zeroThreshold
                }
        }

        if raw, present := fields["custom_values"]; present {
                if customValues, isFloats := raw.([]float64); !isFloats {
                        p.addParseErrf(p.yyParser.lval.item.PositionRange(), "error parsing custom_values: %v", raw)
                } else {
                        output.CustomValues = customValues
                }
        }

        output.PositiveBuckets, output.PositiveSpans = p.buildHistogramBucketsAndSpans(desc, "buckets", "offset")
        output.NegativeBuckets, output.NegativeSpans = p.buildHistogramBucketsAndSpans(desc, "n_buckets", "n_offset")

        return output
}

// buildHistogramBucketsAndSpans reads the bucket values stored under
// bucketsKey (expected to be a []float64) and the offset stored under
// offsetKey (expected to be an int64) from desc. It returns the buckets and,
// when there is at least one bucket, a single span covering all of them that
// starts at the offset (0 when no offset is given). A value of the wrong type
// is recorded as a parse error that includes the offending value.
func (p *parser) buildHistogramBucketsAndSpans(desc *map[string]interface{}, bucketsKey, offsetKey string,
) (buckets []float64, spans []histogram.Span) {
        if raw, ok := (*desc)[bucketsKey]; ok {
                // Keep the raw value under its own name: shadowing it with the
                // assertion result would make the error below print the zero
                // value instead of what was actually found.
                if typed, ok := raw.([]float64); ok {
                        buckets = typed
                } else {
                        p.addParseErrf(p.yyParser.lval.item.PositionRange(), "error parsing %s float array: %v", bucketsKey, raw)
                }
        }

        var offset int32
        if raw, ok := (*desc)[offsetKey]; ok {
                if typed, ok := raw.(int64); ok {
                        offset = int32(typed)
                } else {
                        p.addParseErrf(p.yyParser.lval.item.PositionRange(), "error parsing %s number: %v", offsetKey, raw)
                }
        }

        if len(buckets) > 0 {
                spans = []histogram.Span{{Offset: offset, Length: uint32(len(buckets))}}
        }
        return buckets, spans
}

// number parses a number. The text is first tried as an integer with
// strconv.ParseInt in base 0 (so prefixes select the base) and, if that
// fails, as a float. If both fail a parse error is recorded and the value
// returned by strconv.ParseFloat is passed on.
func (p *parser) number(val string) float64 {
        if n, err := strconv.ParseInt(val, 0, 64); err == nil {
                return float64(n)
        }

        f, err := strconv.ParseFloat(val, 64)
        if err != nil {
                p.addParseErrf(p.yyParser.lval.item.PositionRange(), "error parsing number: %s", err)
        }
        return f
}

// expectType runs checkAST on the node and raises a parse error if the
// resulting type is not the expected one. context names the construct being
// checked and is included in the error message.
func (p *parser) expectType(node Node, want ValueType, context string) {
        if got := p.checkAST(node); got != want {
                p.addParseErrf(node.PositionRange(), "expected type %s in %s, got %s", DocumentedType(want), context, DocumentedType(got))
        }
}

// checkAST checks the validity of the provided AST. This includes type checking.
//
// It returns the type of node: ValueTypeNone for Expressions, the result of
// Type() for any other Expr, and the zero ValueType for unknown nodes.
// Violations are not returned; they are recorded as parse errors on p. Child
// nodes are checked recursively. For binary expressions the node itself may
// be modified (see the comments in that case).
func (p *parser) checkAST(node Node) (typ ValueType) {
        // For expressions the type is determined by their Type function.
        // Lists do not have a type but are not invalid either.
        switch n := node.(type) {
        case Expressions:
                typ = ValueTypeNone
        case Expr:
                typ = n.Type()
        default:
                p.addParseErrf(node.PositionRange(), "unknown node type: %T", node)
        }

        // Recursively check correct typing for child nodes and raise
        // errors in case of bad typing.
        switch n := node.(type) {
        case *EvalStmt:
                ty := p.checkAST(n.Expr)
                if ty == ValueTypeNone {
                        p.addParseErrf(n.Expr.PositionRange(), "evaluation statement must have a valid expression type but got %s", DocumentedType(ty))
                }

        case Expressions:
                for _, e := range n {
                        ty := p.checkAST(e)
                        if ty == ValueTypeNone {
                                p.addParseErrf(e.PositionRange(), "expression must have a valid expression type but got %s", DocumentedType(ty))
                        }
                }
        case *AggregateExpr:
                if !n.Op.IsAggregator() {
                        p.addParseErrf(n.PositionRange(), "aggregation operator expected in aggregation expression but got %q", n.Op)
                }
                p.expectType(n.Expr, ValueTypeVector, "aggregation expression")
                // The parameter must be a scalar for topk, bottomk and quantile,
                // and a string for count_values.
                if n.Op == TOPK || n.Op == BOTTOMK || n.Op == QUANTILE {
                        p.expectType(n.Param, ValueTypeScalar, "aggregation parameter")
                }
                if n.Op == COUNT_VALUES {
                        p.expectType(n.Param, ValueTypeString, "aggregation parameter")
                }

        case *BinaryExpr:
                lt := p.checkAST(n.LHS)
                rt := p.checkAST(n.RHS)

                // opRange returns the PositionRange of the operator part of the BinaryExpr.
                // This is made a function instead of a variable, so it is lazily evaluated on demand.
                opRange := func() (r posrange.PositionRange) {
                        // Remove whitespace at the beginning and end of the range.
                        for r.Start = n.LHS.PositionRange().End; isSpace(rune(p.lex.input[r.Start])); r.Start++ {
                        }
                        for r.End = n.RHS.PositionRange().Start - 1; isSpace(rune(p.lex.input[r.End])); r.End-- {
                        }
                        return
                }

                if n.ReturnBool && !n.Op.IsComparisonOperator() {
                        p.addParseErrf(opRange(), "bool modifier can only be used on comparison operators")
                }

                if n.Op.IsComparisonOperator() && !n.ReturnBool && n.RHS.Type() == ValueTypeScalar && n.LHS.Type() == ValueTypeScalar {
                        p.addParseErrf(opRange(), "comparisons between scalars must use BOOL modifier")
                }

                // NOTE(review): n.VectorMatching is dereferenced here and in the
                // loop below before the nil check in the switch further down —
                // presumably the grammar always sets it; confirm.
                // Set operators with the one-to-one cardinality are rewritten to
                // many-to-many.
                if n.Op.IsSetOperator() && n.VectorMatching.Card == CardOneToOne {
                        n.VectorMatching.Card = CardManyToMany
                }

                for _, l1 := range n.VectorMatching.MatchingLabels {
                        for _, l2 := range n.VectorMatching.Include {
                                if l1 == l2 && n.VectorMatching.On {
                                        p.addParseErrf(opRange(), "label %q must not occur in ON and GROUP clause at once", l1)
                                }
                        }
                }

                if !n.Op.IsOperator() {
                        p.addParseErrf(n.PositionRange(), "binary expression does not support operator %q", n.Op)
                }
                if lt != ValueTypeScalar && lt != ValueTypeVector {
                        p.addParseErrf(n.LHS.PositionRange(), "binary expression must contain only scalar and instant vector types")
                }
                if rt != ValueTypeScalar && rt != ValueTypeVector {
                        p.addParseErrf(n.RHS.PositionRange(), "binary expression must contain only scalar and instant vector types")
                }

                switch {
                case (lt != ValueTypeVector || rt != ValueTypeVector) && n.VectorMatching != nil:
                        // At least one operand is not a vector: matching labels are
                        // an error, and the matching information is dropped from
                        // the node.
                        if len(n.VectorMatching.MatchingLabels) > 0 {
                                p.addParseErrf(n.PositionRange(), "vector matching only allowed between instant vectors")
                        }
                        n.VectorMatching = nil
                case n.Op.IsSetOperator(): // Both operands are Vectors.
                        if n.VectorMatching.Card == CardOneToMany || n.VectorMatching.Card == CardManyToOne {
                                p.addParseErrf(n.PositionRange(), "no grouping allowed for %q operation", n.Op)
                        }
                        if n.VectorMatching.Card != CardManyToMany {
                                p.addParseErrf(n.PositionRange(), "set operations must always be many-to-many")
                        }
                }

                if (lt == ValueTypeScalar || rt == ValueTypeScalar) && n.Op.IsSetOperator() {
                        p.addParseErrf(n.PositionRange(), "set operator %q not allowed in binary scalar expression", n.Op)
                }

        case *Call:
                // Variadic == 0 requires exactly len(ArgTypes) arguments.
                // Otherwise the last entry of ArgTypes is the variadic one: at
                // least len(ArgTypes)-1 arguments are required, and a positive
                // Variadic additionally caps the number of extra arguments.
                nargs := len(n.Func.ArgTypes)
                if n.Func.Variadic == 0 {
                        if nargs != len(n.Args) {
                                p.addParseErrf(n.PositionRange(), "expected %d argument(s) in call to %q, got %d", nargs, n.Func.Name, len(n.Args))
                        }
                } else {
                        na := nargs - 1
                        if na > len(n.Args) {
                                p.addParseErrf(n.PositionRange(), "expected at least %d argument(s) in call to %q, got %d", na, n.Func.Name, len(n.Args))
                        } else if nargsmax := na + n.Func.Variadic; n.Func.Variadic > 0 && nargsmax < len(n.Args) {
                                p.addParseErrf(n.PositionRange(), "expected at most %d argument(s) in call to %q, got %d", nargsmax, n.Func.Name, len(n.Args))
                        }
                }

                for i, arg := range n.Args {
                        if i >= len(n.Func.ArgTypes) {
                                if n.Func.Variadic == 0 {
                                        // This is not a vararg function so we should not check the
                                        // type of the extra arguments.
                                        break
                                }
                                // Extra variadic arguments are checked against the
                                // last declared argument type.
                                i = len(n.Func.ArgTypes) - 1
                        }
                        p.expectType(arg, n.Func.ArgTypes[i], fmt.Sprintf("call to function %q", n.Func.Name))
                }

        case *ParenExpr:
                p.checkAST(n.Expr)

        case *UnaryExpr:
                if n.Op != ADD && n.Op != SUB {
                        p.addParseErrf(n.PositionRange(), "only + and - operators allowed for unary expressions")
                }
                if t := p.checkAST(n.Expr); t != ValueTypeScalar && t != ValueTypeVector {
                        p.addParseErrf(n.PositionRange(), "unary expression only allowed on expressions of type scalar or instant vector, got %q", DocumentedType(t))
                }

        case *SubqueryExpr:
                ty := p.checkAST(n.Expr)
                if ty != ValueTypeVector {
                        p.addParseErrf(n.PositionRange(), "subquery is only allowed on instant vector, got %s instead", ty)
                }
        case *MatrixSelector:
                p.checkAST(n.VectorSelector)

        case *VectorSelector:
                if n.Name != "" {
                        // In this case the last LabelMatcher is checking for the metric name
                        // set outside the braces. This checks if the name has already been set
                        // previously.
                        for _, m := range n.LabelMatchers[0 : len(n.LabelMatchers)-1] {
                                if m != nil && m.Name == labels.MetricName {
                                        p.addParseErrf(n.PositionRange(), "metric name must not be set twice: %q or %q", n.Name, m.Value)
                                }
                        }

                        // Skip the check for non-empty matchers because an explicit
                        // metric name is a non-empty matcher.
                        break
                }
                // A Vector selector must contain at least one non-empty matcher to prevent
                // implicit selection of all metrics (e.g. by a typo).
                notEmpty := false
                for _, lm := range n.LabelMatchers {
                        if lm != nil && !lm.Matches("") {
                                notEmpty = true
                                break
                        }
                }
                if !notEmpty {
                        p.addParseErrf(n.PositionRange(), "vector selector must contain at least one non-empty matcher")
                }

        case *NumberLiteral, *StringLiteral:
                // Nothing to do for terminals.

        default:
                p.addParseErrf(n.PositionRange(), "unknown node type: %T", node)
        }
        return
}

// unquoteString unquotes s with strutil.Unquote. On failure a parse error is
// recorded and whatever strutil.Unquote returned is passed on.
func (p *parser) unquoteString(s string) string {
        out, err := strutil.Unquote(s)
        if err == nil {
                return out
        }
        p.addParseErrf(p.yyParser.lval.item.PositionRange(), "error unquoting string %q: %s", s, err)
        return out
}

// parseDuration parses ds with model.ParseDuration and converts the result to
// a time.Duration. A duration of zero is rejected with an error.
func parseDuration(ds string) (time.Duration, error) {
        parsed, err := model.ParseDuration(ds)
        switch {
        case err != nil:
                return 0, err
        case parsed == 0:
                return 0, errors.New("duration must be greater than 0")
        }
        return time.Duration(parsed), nil
}

// parseGenerated invokes the yacc generated parser.
// The generated parser gets the provided startSymbol injected into
// the lexer stream, based on which grammar will be used.
// It returns p.generatedParserResult as it stands after the generated parser
// has run.
func (p *parser) parseGenerated(startSymbol ItemType) interface{} {
        // Queue the start symbol so that the first call to Lex returns it.
        p.InjectItem(startSymbol)

        // The parser passes itself to the generated parser, which calls back
        // into Lex for its items.
        p.yyParser.Parse(p)

        return p.generatedParserResult
}

// newLabelMatcher builds a label matcher from the three lexer items of a
// matcher: the label name, the match operator and the quoted value. The value
// is unquoted first. A failure to build the matcher is recorded as a parse
// error spanning label through value; the result of labels.NewMatcher is
// returned either way. It panics on an operator that is not a match operator.
func (p *parser) newLabelMatcher(label, operator, value Item) *labels.Matcher {
        unquoted := p.unquoteString(value.Val)

        // Map the Item to the respective match type.
        var matchType labels.MatchType
        switch operator.Typ {
        case EQL:
                matchType = labels.MatchEqual
        case NEQ:
                matchType = labels.MatchNotEqual
        case EQL_REGEX:
                matchType = labels.MatchRegexp
        case NEQ_REGEX:
                matchType = labels.MatchNotRegexp
        default:
                // This should never happen, since the error should have been caught
                // by the generated parser.
                panic("invalid operator")
        }

        matcher, err := labels.NewMatcher(matchType, label.Val, unquoted)
        if err != nil {
                p.addParseErr(mergeRanges(&label, &value), err)
        }
        return matcher
}

// newMetricNameMatcher builds an equality matcher on the metric name label
// for the value of the given item. A failure is recorded as a parse error at
// the item's position; the result of labels.NewMatcher is returned either way.
func (p *parser) newMetricNameMatcher(value Item) *labels.Matcher {
        matcher, err := labels.NewMatcher(labels.MatchEqual, labels.MetricName, value.Val)
        if err == nil {
                return matcher
        }
        p.addParseErr(value.PositionRange(), err)
        return matcher
}

// addOffset is used to set the offset in the generated parser. It applies to
// vector selectors, matrix selectors (via their inner vector selector) and
// subqueries; any other node is rejected with a parse error. Setting an
// offset on a node that already has one is a parse error as well. In both
// accepted cases the node's end position is moved to the last closing item.
func (p *parser) addOffset(e Node, offset time.Duration) {
        var (
                origOffset *time.Duration
                endPos     *posrange.Pos
        )

        switch s := e.(type) {
        case *VectorSelector:
                origOffset, endPos = &s.OriginalOffset, &s.PosRange.End
        case *MatrixSelector:
                vs, ok := s.VectorSelector.(*VectorSelector)
                if !ok {
                        p.addParseErrf(e.PositionRange(), "ranges only allowed for vector selectors")
                        return
                }
                origOffset, endPos = &vs.OriginalOffset, &s.EndPos
        case *SubqueryExpr:
                origOffset, endPos = &s.OriginalOffset, &s.EndPos
        default:
                p.addParseErrf(e.PositionRange(), "offset modifier must be preceded by an instant vector selector or range vector selector or a subquery")
                return
        }

        // It is already ensured by the parseDuration func that there never will
        // be a zero offset modifier, so a non-zero value means one was set before.
        if *origOffset != 0 {
                p.addParseErrf(e.PositionRange(), "offset may not be set multiple times")
        } else {
                *origOffset = offset
        }

        *endPos = p.lastClosing
}

// setTimestamp is used by the generated parser to store the timestamp of an
// @ modifier on e.
//
// An infinite, NaN or int64-overflowing ts is recorded as a parse error, but
// processing continues afterwards. If getAtModifierVars rejects e, nothing
// else happens; otherwise the timestamp is stored via
// timestamp.FromFloatSeconds and the node's end position is moved to the last
// closing position seen by the parser.
func (p *parser) setTimestamp(e Node, ts float64) {
        outOfBounds := math.IsInf(ts, 0) || math.IsNaN(ts) ||
                ts >= float64(math.MaxInt64) || ts <= float64(math.MinInt64)
        if outOfBounds {
                p.addParseErrf(e.PositionRange(), "timestamp out of bounds for @ modifier: %f", ts)
        }

        tsRef, _, endRef, ok := p.getAtModifierVars(e)
        if !ok {
                return
        }

        if tsRef != nil {
                converted := timestamp.FromFloatSeconds(ts)
                *tsRef = &converted
        }

        *endRef = p.lastClosing
}

// setAtModifierPreprocessor stores the item type of op as the preprocessor of
// the @ modifier on e and moves the node's end position to the last closing
// position seen by the parser. Nothing is changed if getAtModifierVars
// rejects e.
func (p *parser) setAtModifierPreprocessor(e Node, op Item) {
        _, preprocRef, endRef, accepted := p.getAtModifierVars(e)
        if !accepted {
                return
        }

        if preprocRef != nil {
                *preprocRef = op.Typ
        }
        *endRef = p.lastClosing
}

// getAtModifierVars returns pointers to the fields of e that an @ modifier
// writes: the timestamp, the start()/end() preprocessor, and the end position.
//
// The final bool is false, and all pointers are nil, when e cannot carry an
// @ modifier (not a vector selector, matrix selector over a vector selector,
// or subquery) or when a timestamp or START/END preprocessor is already set.
// In each of those cases a parse error has been recorded.
func (p *parser) getAtModifierVars(e Node) (**int64, *ItemType, *posrange.Pos, bool) {
        var (
                tsRef      **int64
                preprocRef *ItemType
                endRef     *posrange.Pos
        )

        switch n := e.(type) {
        case *VectorSelector:
                tsRef, preprocRef, endRef = &n.Timestamp, &n.StartOrEnd, &n.PosRange.End
        case *MatrixSelector:
                inner, isVector := n.VectorSelector.(*VectorSelector)
                if !isVector {
                        p.addParseErrf(e.PositionRange(), "ranges only allowed for vector selectors")
                        return nil, nil, nil, false
                }
                tsRef, preprocRef, endRef = &inner.Timestamp, &inner.StartOrEnd, &n.EndPos
        case *SubqueryExpr:
                tsRef, preprocRef, endRef = &n.Timestamp, &n.StartOrEnd, &n.EndPos
        default:
                p.addParseErrf(e.PositionRange(), "@ modifier must be preceded by an instant vector selector or range vector selector or a subquery")
                return nil, nil, nil, false
        }

        alreadySet := *tsRef != nil || *preprocRef == START || *preprocRef == END
        if alreadySet {
                p.addParseErrf(e.PositionRange(), "@ <timestamp> may not be set multiple times")
                return nil, nil, nil, false
        }

        return tsRef, preprocRef, endRef, true
}

// MustLabelMatcher creates a label matcher with labels.NewMatcher and panics
// with the returned error if the matcher cannot be constructed.
func MustLabelMatcher(mt labels.MatchType, name, val string) *labels.Matcher {
        matcher, err := labels.NewMatcher(mt, name, val)
        if err == nil {
                return matcher
        }
        panic(err)
}

// MustGetFunction looks up name in Functions and returns the matching
// function. It panics if no function with that name exists.
func MustGetFunction(name string) *Function {
        if fn, found := getFunction(name, Functions); found {
                return fn
        }
        panic(fmt.Errorf("function %q does not exist", name))
}

// Copyright 2023 The Prometheus Authors
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// Package posrange is used to report a position in query strings for error
// and warning messages.
package posrange

import "fmt"

// Pos is a byte position in a string.
// Negative numbers indicate undefined positions.
type Pos int

// PositionRange describes a span in the input string of the parser.
type PositionRange struct {
        Start Pos
        End   Pos
}

// StartPosInput renders the start of the range as a "line:col" string relative
// to query. Lines and columns are 1-based, and columns are counted in bytes.
//
// It returns "unknown position" when query is empty and "invalid position"
// when Start is negative or lies beyond the end of query. When this is used to
// convert ParseErr to a string, lineOffset is an additional line offset to be
// added, and is only used inside unit tests.
func (p PositionRange) StartPosInput(query string, lineOffset int) string {
        if query == "" {
                return "unknown position"
        }

        start := int(p.Start)
        if start < 0 || start > len(query) {
                return "invalid position"
        }

        // '\n' is a single byte in UTF-8 and never occurs inside a multi-byte
        // sequence, so scanning bytes finds the same line breaks as scanning runes.
        lineNo, lastBreak := lineOffset+1, -1
        for idx := 0; idx < start; idx++ {
                if query[idx] == '\n' {
                        lastBreak = idx
                        lineNo++
                }
        }

        return fmt.Sprintf("%d:%d", lineNo, start-lastBreak)
}

// Copyright 2022 The Prometheus Authors
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package parser

import (
        "fmt"
        "strings"
)

// Approach
// --------
// When a PromQL query is parsed, it is converted into PromQL AST,
// which is a nested structure of nodes. Each node has a depth/level
// (distance from the root), that is passed by its parent.
//
// While prettifying, a Node considers 2 things:
// 1. Did the current Node's parent add a new line?
// 2. Does the current Node need to be prettified?
//
// The level of a Node determines if it should be indented or not.
// The answer to 1 is NO if the level passed is 0. This means the
// parent Node did not apply a new line, so the current Node must not
// apply any indentation as prefix.
// If level > 0, a new line is applied by the parent. So, the current Node
// should prefix an indentation before writing any of its content. This indentation
// will be ([level/depth of current Node] * "  ").
//
// The answer to 2 is YES if the normalized length of the current Node exceeds
// the maxCharactersPerLine limit. Hence, it applies the indentation equal to
// its depth and increments the level by 1 before passing down the child.
// If the answer is NO, the current Node returns the normalized string value of itself.

// maxCharactersPerLine is the threshold used by needsSplit: a node whose
// String() form is longer than this many bytes is split across lines.
var maxCharactersPerLine = 100

// Prettify returns the pretty-printed form of n, starting at indentation
// level 0.
func Prettify(n Node) string {
        const rootLevel = 0
        return n.Pretty(rootLevel)
}

// Pretty renders the aggregation at the given indentation level. If the
// expression does not need splitting it is printed on a single line;
// otherwise the parameter (for aggregators that take one) and the inner
// expression are each placed on their own line, one level deeper.
func (e *AggregateExpr) Pretty(level int) string {
        prefix := indent(level)
        if !needsSplit(e) {
                return prefix + e.String()
        }

        var b strings.Builder
        b.WriteString(prefix)
        b.WriteString(e.getAggOpStr())
        b.WriteString("(\n")
        if e.Op.IsAggregatorWithParam() {
                b.WriteString(e.Param.Pretty(level + 1))
                b.WriteString(",\n")
        }
        b.WriteString(e.Expr.Pretty(level + 1))
        b.WriteString("\n")
        b.WriteString(prefix)
        b.WriteString(")")
        return b.String()
}

// Pretty renders the binary expression at the given indentation level. If the
// expression does not need splitting it is printed on a single line;
// otherwise the left operand, the operator line (operator, optional " bool"
// and vector matching clause) and the right operand each get their own line,
// with both operands one level deeper.
func (e *BinaryExpr) Pretty(level int) string {
        if !needsSplit(e) {
                return indent(level) + e.String()
        }

        boolModifier := ""
        if e.ReturnBool {
                boolModifier = " bool"
        }
        matching := e.getMatchingStr()

        lhs := e.LHS.Pretty(level + 1)
        operatorLine := fmt.Sprintf("%s%s%s%s", indent(level), e.Op, boolModifier, matching)
        rhs := e.RHS.Pretty(level + 1)

        return lhs + "\n" + operatorLine + "\n" + rhs
}

// Pretty renders the function call at the given indentation level. If the
// call does not need splitting it is printed on a single line; otherwise the
// arguments are printed one level deeper between the opening and closing
// parentheses.
func (e *Call) Pretty(level int) string {
        prefix := indent(level)
        if !needsSplit(e) {
                return prefix + e.String()
        }

        args := e.Args.Pretty(level + 1)
        return prefix + e.Func.Name + "(\n" + args + "\n" + prefix + ")"
}

// Pretty returns the same single-line form as String ("EVAL " followed by the
// expression); the level argument is ignored.
func (e *EvalStmt) Pretty(_ int) string {
        return e.String()
}

// Pretty renders every expression at the given level, joined by ",\n".
// No indentation is added here because each expression indents itself.
// An empty list yields the empty string, mirroring Expressions.String; the
// previous implementation sliced past the start of an empty string and
// panicked in that case.
func (e Expressions) Pretty(level int) string {
        if len(e) == 0 {
                return ""
        }

        parts := make([]string, 0, len(e))
        for _, expr := range e {
                parts = append(parts, expr.Pretty(level))
        }
        return strings.Join(parts, ",\n")
}

// Pretty renders the parenthesized expression at the given indentation level.
// If it does not need splitting it is printed on a single line; otherwise the
// inner expression is printed one level deeper between the parentheses.
func (e *ParenExpr) Pretty(level int) string {
        prefix := indent(level)
        if !needsSplit(e) {
                return prefix + e.String()
        }

        inner := e.Expr.Pretty(level + 1)
        return prefix + "(\n" + inner + "\n" + prefix + ")"
}

// Pretty delegates to the wrapped expression at the same level; the wrapper
// itself contributes nothing to the output.
func (e *StepInvariantExpr) Pretty(level int) string {
        wrapped := e.Expr
        return wrapped.Pretty(level)
}

// Pretty returns the selector's String() form prefixed with the indentation
// for level. Matrix selectors are never split.
func (e *MatrixSelector) Pretty(level int) string {
        return indent(level) + e.String()
}

// Pretty renders the subquery at the given indentation level.
//
// If the subquery does not need splitting, its single-line form is returned
// with the indentation for level in front. The indentation was previously
// missing, so a short subquery nested inside a split parent was emitted at
// column 0 while every other node type indents itself. If the subquery needs
// splitting, the inner expression is prettified at the same level (it adds
// the indentation itself) and the '[range:step] @ ... offset ...' suffix is
// appended.
func (e *SubqueryExpr) Pretty(level int) string {
        if !needsSplit(e) {
                return indent(level) + e.String()
        }
        return fmt.Sprintf("%s%s", e.Expr.Pretty(level), e.getSubqueryTimeSuffix())
}

// Pretty returns the selector's String() form prefixed with the indentation
// for level. Vector selectors are never split.
func (e *VectorSelector) Pretty(level int) string {
        return indent(level) + e.String()
}

// Pretty returns the literal's String() form prefixed with the indentation
// for level.
func (e *NumberLiteral) Pretty(level int) string {
        return indent(level) + e.String()
}

// Pretty returns the literal's String() form prefixed with the indentation
// for level.
func (e *StringLiteral) Pretty(level int) string {
        return indent(level) + e.String()
}

// Pretty renders the unary expression as indentation, operator, operand.
// The operand is prettified at the same level and then stripped of
// surrounding whitespace, because the indentation has to sit in front of the
// operator rather than between operator and operand.
func (e *UnaryExpr) Pretty(level int) string {
        operand := strings.TrimSpace(e.Expr.Pretty(level))
        return fmt.Sprintf("%s%s%s", indent(level), e.Op, operand)
}

// getCommonPrefixIndent returns the String() form of current prefixed with
// the indentation for level.
func getCommonPrefixIndent(level int, current Node) string {
        return indent(level) + current.String()
}

// needsSplit reports whether n has to be split across several lines, i.e.
// whether its String() form is longer than maxCharactersPerLine bytes.
// A nil node never needs splitting.
func needsSplit(n Node) bool {
        return n != nil && len(n.String()) > maxCharactersPerLine
}

// indentString is the text emitted for one indentation level.
const indentString = "  "

// indent returns indentString repeated n times.
func indent(n int) string {
        prefix := strings.Repeat(indentString, n)
        return prefix
}

// Copyright 2015 The Prometheus Authors
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package parser

import (
        "fmt"
        "sort"
        "strings"
        "time"

        "github.com/prometheus/common/model"

        "github.com/prometheus/prometheus/model/labels"
)

// Tree returns a textual rendering of the tree rooted at node, one line per
// node, starting with an empty level prefix.
func Tree(node Node) string {
        const rootPrefix = ""
        return tree(node, rootPrefix)
}

// tree renders node and, recursively, its children. Each line consists of the
// level prefix, " |---- ", the node's type name with the package qualifier
// removed, and the node itself; children use a prefix extended by " · · ·".
// A nil node is rendered as a single line carrying only its %T form.
func tree(node Node, level string) string {
        if node == nil {
                return fmt.Sprintf("%s |---- %T\n", level, node)
        }

        // %T yields "<pkg>.<Type>"; keep the part after the first dot.
        typeName := strings.Split(fmt.Sprintf("%T", node), ".")[1]

        var out strings.Builder
        fmt.Fprintf(&out, "%s |---- %s :: %s\n", level, typeName, node)

        childLevel := level + " · · ·"
        for _, child := range Children(node) {
                out.WriteString(tree(child, childLevel))
        }
        return out.String()
}

// String returns "EVAL " followed by the string form of the statement's
// expression.
func (node *EvalStmt) String() string {
        const keyword = "EVAL "
        return keyword + node.Expr.String()
}

// String returns the string forms of all expressions joined by ", ".
// An empty list yields the empty string.
func (es Expressions) String() (s string) {
        for idx, expr := range es {
                if idx > 0 {
                        s += ", "
                }
                s += expr.String()
        }
        return s
}

// String returns the single-line form of the aggregation: the operator with
// its grouping clause, followed in parentheses by the parameter (for
// aggregators that take one) and the inner expression.
func (node *AggregateExpr) String() string {
        var b strings.Builder
        b.WriteString(node.getAggOpStr())
        b.WriteString("(")
        if node.Op.IsAggregatorWithParam() {
                fmt.Fprintf(&b, "%s, ", node.Param)
        }
        fmt.Fprintf(&b, "%s)", node.Expr)
        return b.String()
}

// getAggOpStr returns the aggregation operator followed by its grouping
// clause: " without (...) " when Without is set, " by (...) " when grouping
// labels are present, and nothing otherwise.
func (node *AggregateExpr) getAggOpStr() string {
        op := node.Op.String()

        if node.Without {
                return op + " without (" + strings.Join(node.Grouping, ", ") + ") "
        }
        if len(node.Grouping) > 0 {
                return op + " by (" + strings.Join(node.Grouping, ", ") + ") "
        }
        return op
}

// String returns the single-line form of the binary expression:
// "<lhs> <op>[ bool][ <matching>] <rhs>".
func (node *BinaryExpr) String() string {
        var boolModifier string
        if node.ReturnBool {
                boolModifier = " bool"
        }
        return fmt.Sprintf("%s %s%s%s %s", node.LHS, node.Op, boolModifier, node.getMatchingStr(), node.RHS)
}

// getMatchingStr returns the vector matching clause of the expression with a
// leading space: " on (...)" or " ignoring (...)", followed by
// " group_left (...)" or " group_right (...)" for many-to-one and one-to-many
// matching. It returns the empty string when there is no vector matching, or
// when the matching has no labels and On is not set.
func (node *BinaryExpr) getMatchingStr() string {
        vm := node.VectorMatching
        if vm == nil || (len(vm.MatchingLabels) == 0 && !vm.On) {
                return ""
        }

        keyword := "ignoring"
        if vm.On {
                keyword = "on"
        }
        clause := " " + keyword + " (" + strings.Join(vm.MatchingLabels, ", ") + ")"

        switch vm.Card {
        case CardManyToOne:
                clause += " group_left (" + strings.Join(vm.Include, ", ") + ")"
        case CardOneToMany:
                clause += " group_right (" + strings.Join(vm.Include, ", ") + ")"
        }
        return clause
}

// String returns the call as "<function name>(<arguments>)".
func (node *Call) String() string {
        name, args := node.Func.Name, node.Args
        return fmt.Sprintf("%s(%s)", name, args)
}

// String returns the matrix selector as
// "<vector selector>[<range>][ @ <timestamp>][ offset <offset>]".
//
// The @ and offset modifiers are stored on the inner vector selector, but
// they must be printed after the range. The vector selector is therefore
// copied, the modifiers are formatted here, and they are cleared on the copy
// so that the inner String() call does not print them a second time. The
// node itself is never modified.
func (node *MatrixSelector) String() string {
        vs := *node.VectorSelector.(*VectorSelector)

        var offset string
        if vs.OriginalOffset > 0 {
                offset = fmt.Sprintf(" offset %s", model.Duration(vs.OriginalOffset))
        } else if vs.OriginalOffset < 0 {
                offset = fmt.Sprintf(" offset -%s", model.Duration(-vs.OriginalOffset))
        }

        var at string
        if vs.Timestamp != nil {
                at = fmt.Sprintf(" @ %.3f", float64(*vs.Timestamp)/1000.0)
        } else if vs.StartOrEnd == START {
                at = " @ start()"
        } else if vs.StartOrEnd == END {
                at = " @ end()"
        }

        // Do not print the @ and offset twice.
        vs.OriginalOffset, vs.Timestamp, vs.StartOrEnd = 0, nil, 0

        return fmt.Sprintf("%s[%s]%s%s", vs.String(), model.Duration(node.Range), at, offset)
}

// String returns the inner expression followed by the subquery's
// '[<range>:<step>] @ <timestamp> offset <offset>' suffix.
func (node *SubqueryExpr) String() string {
        return node.Expr.String() + node.getSubqueryTimeSuffix()
}

// getSubqueryTimeSuffix returns the '[<range>:<step>] @ <timestamp> offset <offset>' suffix of the subquery.
// The step is left empty when it is zero, and the @ and offset parts are
// omitted when they are not set. A negative offset is printed as "offset -<d>".
func (node *SubqueryExpr) getSubqueryTimeSuffix() string {
        var step, offset, at string

        if node.Step != 0 {
                step = model.Duration(node.Step).String()
        }

        if off := node.OriginalOffset; off > 0 {
                offset = fmt.Sprintf(" offset %s", model.Duration(off))
        } else if off < 0 {
                offset = fmt.Sprintf(" offset -%s", model.Duration(-off))
        }

        if node.Timestamp != nil {
                at = fmt.Sprintf(" @ %.3f", float64(*node.Timestamp)/1000.0)
        } else if node.StartOrEnd == START {
                at = " @ start()"
        } else if node.StartOrEnd == END {
                at = " @ end()"
        }

        return fmt.Sprintf("[%s:%s]%s%s", model.Duration(node.Range), step, at, offset)
}

// String returns the literal's value in fmt's default formatting.
func (node *NumberLiteral) String() string {
        val := node.Val
        return fmt.Sprint(val)
}

// String returns the inner expression wrapped in parentheses.
func (node *ParenExpr) String() string {
        inner := node.Expr
        return fmt.Sprintf("(%s)", inner)
}

// String returns the literal's value as a double-quoted, escaped string (%q).
func (node *StringLiteral) String() string {
        val := node.Val
        return fmt.Sprintf("%q", val)
}

// String returns the operator immediately followed by the operand.
func (node *UnaryExpr) String() string {
        op, operand := node.Op, node.Expr
        return fmt.Sprintf("%s%s", op, operand)
}

// String returns the selector as
// "<name>[{<matchers>}][ @ <timestamp>][ offset <offset>]".
//
// Matchers are sorted by their string form and joined with ",". The equality
// matcher on the metric name label is left out when it merely repeats a
// non-empty node.Name. A negative offset is printed as "offset -<d>".
func (node *VectorSelector) String() string {
        matchers := make([]string, 0, len(node.LabelMatchers))
        for _, m := range node.LabelMatchers {
                // Only skip the __name__ matcher if it is an equality match on the
                // selector's own name; an explicit empty name matcher is kept.
                repeatsName := m.Name == labels.MetricName &&
                        m.Type == labels.MatchEqual &&
                        m.Value == node.Name &&
                        m.Value != ""
                if !repeatsName {
                        matchers = append(matchers, m.String())
                }
        }

        var offset string
        if off := node.OriginalOffset; off > 0 {
                offset = fmt.Sprintf(" offset %s", model.Duration(off))
        } else if off < 0 {
                offset = fmt.Sprintf(" offset -%s", model.Duration(-off))
        }

        var at string
        if node.Timestamp != nil {
                at = fmt.Sprintf(" @ %.3f", float64(*node.Timestamp)/1000.0)
        } else if node.StartOrEnd == START {
                at = " @ start()"
        } else if node.StartOrEnd == END {
                at = " @ end()"
        }

        if len(matchers) == 0 {
                return fmt.Sprintf("%s%s%s", node.Name, at, offset)
        }
        sort.Strings(matchers)
        return fmt.Sprintf("%s{%s}%s%s", node.Name, strings.Join(matchers, ","), at, offset)
}

// Copyright 2017 The Prometheus Authors
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package parser

// Value is a generic interface for values resulting from a query evaluation.
// Implementations report their ValueType via Type and provide a string form
// via String.
type Value interface {
        Type() ValueType
        String() string
}

// ValueType names the type of a value.
type ValueType string

// The valid value types.
const (
        ValueTypeNone   ValueType = "none"
        ValueTypeVector ValueType = "vector"
        ValueTypeScalar ValueType = "scalar"
        ValueTypeMatrix ValueType = "matrix"
        ValueTypeString ValueType = "string"
)

// DocumentedType maps the internal type name to the user facing terminology
// used in the documentation: "vector" becomes "instant vector" and "matrix"
// becomes "range vector". Every other type is returned as is.
func DocumentedType(t ValueType) string {
        if t == ValueTypeVector {
                return "instant vector"
        }
        if t == ValueTypeMatrix {
                return "range vector"
        }
        return string(t)
}

// Copyright 2015 The Prometheus Authors
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package promqltest

import (
        "context"
        "embed"
        "errors"
        "fmt"
        "io/fs"
        "math"
        "sort"
        "strconv"
        "strings"
        "testing"
        "time"

        "github.com/grafana/regexp"
        "github.com/prometheus/common/model"
        "github.com/stretchr/testify/require"

        "github.com/prometheus/prometheus/model/exemplar"
        "github.com/prometheus/prometheus/model/histogram"
        "github.com/prometheus/prometheus/model/labels"
        "github.com/prometheus/prometheus/model/timestamp"
        "github.com/prometheus/prometheus/promql"
        "github.com/prometheus/prometheus/promql/parser"
        "github.com/prometheus/prometheus/promql/parser/posrange"
        "github.com/prometheus/prometheus/storage"
        "github.com/prometheus/prometheus/util/almost"
        "github.com/prometheus/prometheus/util/teststorage"
        "github.com/prometheus/prometheus/util/testutil"
)

// Patterns used to recognize the commands of a test script:
//   - patSpace matches a run of tabs and/or spaces; parse uses it to cut the
//     first word off a line.
//   - patLoad matches "load" or "load_with_nhcb" followed by the step.
//   - patEvalInstant matches "eval", "eval_fail", "eval_warn" or
//     "eval_ordered" followed by "instant", an "at <time>" clause and the query.
//   - patEvalRange matches "eval", "eval_fail" or "eval_warn" followed by
//     "range from <from> to <to> step <step>" and the query.
var (
        patSpace       = regexp.MustCompile("[\t ]+")
        patLoad        = regexp.MustCompile(`^load(?:_(with_nhcb))?\s+(.+?)$`)
        patEvalInstant = regexp.MustCompile(`^eval(?:_(fail|warn|ordered))?\s+instant\s+(?:at\s+(.+?))?\s+(.+)$`)
        patEvalRange   = regexp.MustCompile(`^eval(?:_(fail|warn))?\s+range\s+from\s+(.+)\s+to\s+(.+)\s+step\s+(.+?)\s+(.+)$`)
)

// defaultEpsilon and DefaultMaxSamplesPerQuery are defaults for result
// comparison and for the engine's sample limit; their consumers are outside
// this part of the file.
const (
        defaultEpsilon            = 0.000001 // Relative error allowed for sample values.
        DefaultMaxSamplesPerQuery = 10000
)

// testStartTime is the Unix epoch in UTC. The offsets given in eval commands
// are added to it to obtain evaluation timestamps.
var testStartTime = time.Unix(0, 0).UTC()

// LoadedStorage parses input and executes its load statements, returning the
// storage they were executed against. Any command other than a load command
// is reported as a test error and skipped.
func LoadedStorage(t testutil.T, input string) *teststorage.TestStorage {
        test, err := newTest(t, input)
        require.NoError(t, err)

        for _, cmd := range test.cmds {
                if _, isLoad := cmd.(*loadCmd); !isLoad {
                        t.Errorf("only 'load' commands accepted, got '%s'", cmd)
                        continue
                }
                require.NoError(t, test.exec(cmd, nil))
        }
        return test.storage
}

// NewTestEngine creates a PromQL engine for tests. The @ modifier and
// negative offsets are enabled, the query timeout is 100 seconds, and the
// no-step subquery interval is fixed at one minute. Per-step statistics, the
// lookback delta and the sample limit are taken from the arguments.
func NewTestEngine(enablePerStepStats bool, lookbackDelta time.Duration, maxSamples int) *promql.Engine {
        opts := promql.EngineOpts{
                Logger:                   nil,
                Reg:                      nil,
                MaxSamples:               maxSamples,
                Timeout:                  100 * time.Second,
                NoStepSubqueryIntervalFn: func(int64) int64 { return durationMilliseconds(1 * time.Minute) },
                EnableAtModifier:         true,
                EnableNegativeOffset:     true,
                EnablePerStepStats:       enablePerStepStats,
                LookbackDelta:            lookbackDelta,
        }
        return promql.NewEngine(opts)
}

// RunBuiltinTests runs the embedded acceptance test suite against the
// provided engine, one subtest per "*/*.test" file in testsFs.
// Experimental functions are enabled for the duration of the test and
// switched off again on cleanup.
func RunBuiltinTests(t *testing.T, engine promql.QueryEngine) {
        t.Cleanup(func() { parser.EnableExperimentalFunctions = false })
        parser.EnableExperimentalFunctions = true

        testFiles, globErr := fs.Glob(testsFs, "*/*.test")
        require.NoError(t, globErr)

        for _, name := range testFiles {
                t.Run(name, func(t *testing.T) {
                        raw, readErr := fs.ReadFile(testsFs, name)
                        require.NoError(t, readErr)
                        RunTest(t, string(raw), engine)
                })
        }
}

// RunTest parses input and runs it against the provided engine, failing t if
// parsing or any command returns an error.
func RunTest(t testutil.T, input string, engine promql.QueryEngine) {
        err := runTest(t, input, engine)
        require.NoError(t, err)
}

// runTest parses input into a test and executes its commands in order against
// engine. It returns the parse error, or the error of the first command that
// fails. The test's storage is closed and its context cancelled before
// returning.
func runTest(t testutil.T, input string, engine promql.QueryEngine) error {
        test, err := newTest(t, input)

        // The cleanup is registered before err is checked: newTest() can create
        // the test storage and then return an error, and that storage still has
        // to be cleaned up to avoid leaking goroutines.
        cleanup := func() {
                if test == nil {
                        return
                }
                if test.storage != nil {
                        test.storage.Close()
                }
                if test.cancelCtx != nil {
                        test.cancelCtx()
                }
        }
        defer cleanup()

        if err != nil {
                return err
        }

        for _, cmd := range test.cmds {
                // TODO(fabxc): aggregate command errors, yield diffs for result
                // comparison errors.
                if execErr := test.exec(cmd, engine); execErr != nil {
                        return execErr
                }
        }
        return nil
}

// test is a sequence of read and write commands that are run
// against a test storage.
type test struct {
        // T is the testing handle the test was created with.
        testutil.T

        // cmds holds the parsed commands in script order.
        cmds []testCommand

        // storage is the storage the commands run against; runTest closes it
        // when it is non-nil.
        storage *teststorage.TestStorage

        // cancelCtx is called by runTest during cleanup when it is non-nil.
        // NOTE(review): context and cancelCtx are presumably created together
        // in clear(), which is outside this part of the file; confirm.
        context   context.Context
        cancelCtx context.CancelFunc
}

// newTest creates a test for t, parses input into its command list and then
// calls clear on it. The test is returned together with the parse error, so
// the returned test is non-nil even when the error is not.
func newTest(t testutil.T, input string) (*test, error) {
        tst := &test{T: t, cmds: []testCommand{}}

        parseErr := tst.parse(input)
        tst.clear()

        return tst, parseErr
}

// testsFs embeds the testdata directory. RunBuiltinTests globs it for
// "*/*.test" files.
//
//go:embed testdata
var testsFs embed.FS

// raise builds a *parser.ParseErr whose LineOffset is line and whose error is
// formatted from format and v in the manner of fmt.Errorf.
func raise(line int, format string, v ...interface{}) error {
        cause := fmt.Errorf(format, v...)
        return &parser.ParseErr{LineOffset: line, Err: cause}
}

// parseLoad parses the load command at lines[i] and the series definitions on
// the lines that follow it, up to the next empty line or the end of input.
//
// It returns the index of the last line it consumed, the resulting load
// command, and an error if the command line, the step or a series definition
// is invalid.
func parseLoad(lines []string, i int) (int, *loadCmd, error) {
        parts := patLoad.FindStringSubmatch(lines[i])
        if parts == nil {
                return i, nil, raise(i, "invalid load command. (load[_with_nhcb] <step:duration>)")
        }
        withNHCB, step := parts[1] == "with_nhcb", parts[2]

        gap, err := model.ParseDuration(step)
        if err != nil {
                return i, nil, raise(i, "invalid step definition %q: %s", step, err)
        }

        cmd := newLoadCmd(time.Duration(gap), withNHCB)
        // Consume series definitions until an empty line; the empty line itself
        // is not consumed.
        for next := i + 1; next < len(lines) && len(lines[next]) > 0; next++ {
                i = next
                metric, vals, err := parseSeries(lines[i], i)
                if err != nil {
                        return i, nil, err
                }
                cmd.set(metric, vals...)
        }
        return i, cmd, nil
}

// parseSeries parses a series description line into its labels and sequence
// values. On failure the parse error is enriched with line as its line offset
// and returned together with empty labels.
func parseSeries(defLine string, line int) (labels.Labels, []parser.SequenceValue, error) {
        metric, vals, err := parser.ParseSeriesDesc(defLine)
        if err == nil {
                return metric, vals, nil
        }

        parser.EnrichParseError(err, func(parseErr *parser.ParseErr) {
                parseErr.LineOffset = line
        })
        return labels.Labels{}, nil, err
}

// parseEval parses the eval command at lines[i] together with the expectation
// lines that follow it, up to the next empty line or the end of input.
//
// The command line must match patEvalInstant or patEvalRange. The embedded
// query is validated with parser.ParseExpr; on failure the parse error is
// enriched with the line offset, the full line as query, and positions shifted
// to where the query starts inside the line. The fail/warn/ordered suffix of
// the command sets the corresponding flag on the resulting evalCmd.
//
// It returns the index of the last line it consumed, the evalCmd, and an
// error if the command or any expectation line is invalid.
func (t *test) parseEval(lines []string, i int) (int, *evalCmd, error) {
        instantParts := patEvalInstant.FindStringSubmatch(lines[i])
        rangeParts := patEvalRange.FindStringSubmatch(lines[i])

        if instantParts == nil && rangeParts == nil {
                return i, nil, raise(i, "invalid evaluation command. Must be either 'eval[_fail|_warn|_ordered] instant [at <offset:duration>] <query>' or 'eval[_fail|_warn] range from <from> to <to> step <step> <query>'")
        }

        isInstant := instantParts != nil

        var mod string
        var expr string

        if isInstant {
                mod = instantParts[1]
                expr = instantParts[3]
        } else {
                mod = rangeParts[1]
                expr = rangeParts[5]
        }

        // The parsed expression is discarded; this call only validates the query.
        _, err := parser.ParseExpr(expr)
        if err != nil {
                parser.EnrichParseError(err, func(parseErr *parser.ParseErr) {
                        parseErr.LineOffset = i
                        posOffset := posrange.Pos(strings.Index(lines[i], expr))
                        parseErr.PositionRange.Start += posOffset
                        parseErr.PositionRange.End += posOffset
                        parseErr.Query = lines[i]
                })
                return i, nil, err
        }

        // formatErr prefixes an error with the query and its 1-based line number.
        formatErr := func(format string, args ...any) error {
                combinedArgs := []any{expr, i + 1}

                combinedArgs = append(combinedArgs, args...)
                return fmt.Errorf("error in eval %s (line %v): "+format, combinedArgs...)
        }

        var cmd *evalCmd

        if isInstant {
                at := instantParts[2]
                offset, err := model.ParseDuration(at)
                if err != nil {
                        return i, nil, formatErr("invalid timestamp definition %q: %s", at, err)
                }
                ts := testStartTime.Add(time.Duration(offset))
                cmd = newInstantEvalCmd(expr, ts, i+1)
        } else {
                from := rangeParts[2]
                to := rangeParts[3]
                step := rangeParts[4]

                parsedFrom, err := model.ParseDuration(from)
                if err != nil {
                        return i, nil, formatErr("invalid start timestamp definition %q: %s", from, err)
                }

                parsedTo, err := model.ParseDuration(to)
                if err != nil {
                        return i, nil, formatErr("invalid end timestamp definition %q: %s", to, err)
                }

                if parsedTo < parsedFrom {
                        return i, nil, formatErr("invalid test definition, end timestamp (%s) is before start timestamp (%s)", to, from)
                }

                parsedStep, err := model.ParseDuration(step)
                if err != nil {
                        return i, nil, formatErr("invalid step definition %q: %s", step, err)
                }

                cmd = newRangeEvalCmd(expr, testStartTime.Add(time.Duration(parsedFrom)), testStartTime.Add(time.Duration(parsedTo)), time.Duration(parsedStep), i+1)
        }

        switch mod {
        case "ordered":
                // Ordered results are not supported for range queries, but the regex for range query commands does not allow
                // asserting an ordered result, so we don't need to do any error checking here.
                cmd.ordered = true
        case "fail":
                cmd.fail = true
        case "warn":
                cmd.warn = true
        }

        // Consume the expectation lines. j is the position passed to
        // expectMetric and starts at 1; an empty line ends the block and is not
        // consumed.
        for j := 1; i+1 < len(lines); j++ {
                i++
                defLine := lines[i]
                if len(defLine) == 0 {
                        i--
                        break
                }

                // For eval_fail, an expected message or regexp line ends the block.
                if cmd.fail && strings.HasPrefix(defLine, "expected_fail_message") {
                        cmd.expectedFailMessage = strings.TrimSpace(strings.TrimPrefix(defLine, "expected_fail_message"))
                        break
                }

                if cmd.fail && strings.HasPrefix(defLine, "expected_fail_regexp") {
                        pattern := strings.TrimSpace(strings.TrimPrefix(defLine, "expected_fail_regexp"))
                        cmd.expectedFailRegexp, err = regexp.Compile(pattern)
                        if err != nil {
                                return i, nil, formatErr("invalid regexp '%s' for expected_fail_regexp: %w", pattern, err)
                        }
                        break
                }

                // A line that parses as a plain number is recorded at position 0
                // and ends the block.
                if f, err := parseNumber(defLine); err == nil {
                        cmd.expect(0, parser.SequenceValue{Value: f})
                        break
                }
                metric, vals, err := parseSeries(defLine, i)
                if err != nil {
                        return i, nil, err
                }

                // Currently, we are not expecting any matrices.
                if len(vals) > 1 && isInstant {
                        return i, nil, formatErr("expecting multiple values in instant evaluation not allowed")
                }
                cmd.expectMetric(j, metric, vals...)
        }
        return i, cmd, nil
}

// getLines splits input on newlines and returns one entry per line with
// surrounding whitespace removed. A line whose first non-space character is
// '#' is a comment and is replaced by an empty string, so the number and
// position of lines is preserved.
func getLines(input string) []string {
        out := strings.Split(input, "\n")
        for idx := range out {
                trimmed := strings.TrimSpace(out[idx])
                if strings.HasPrefix(trimmed, "#") {
                        // Blank the comment instead of dropping it so that
                        // indices still match the original line positions.
                        out[idx] = ""
                        continue
                }
                out[idx] = trimmed
        }
        return out
}

// parse scans the input line by line, turns every non-empty line into a
// command and appends the commands to t.cmds in order. The load and eval
// parsers may consume additional lines; they return the index of the last line
// they used. Parsing stops at the first unknown command or parser error.
func (t *test) parse(input string) error {
        lines := getLines(input)
        for i := 0; i < len(lines); i++ {
                line := lines[i]
                if line == "" {
                        continue
                }

                var (
                        cmd testCommand
                        err error
                )
                // The first whitespace-separated word selects the command type.
                keyword := strings.ToLower(patSpace.Split(line, 2)[0])
                switch {
                case keyword == "clear":
                        cmd = &clearCmd{}
                case strings.HasPrefix(keyword, "load"):
                        i, cmd, err = parseLoad(lines, i)
                case strings.HasPrefix(keyword, "eval"):
                        i, cmd, err = t.parseEval(lines, i)
                default:
                        return raise(i, "invalid command %q", line)
                }
                if err != nil {
                        return err
                }
                t.cmds = append(t.cmds, cmd)
        }
        return nil
}

// testCommand is an interface that ensures that only the package internal
// types can be a valid command for a test.
// Its single method is unexported, so only types declared in this package
// can implement it.
type testCommand interface {
        // testCmd is a marker method; it carries no behavior of its own.
        testCmd()
}

// Marker implementations: pointers to the three command types satisfy
// testCommand. The bodies are intentionally empty.
func (*clearCmd) testCmd() {}
func (*loadCmd) testCmd()  {}
func (*evalCmd) testCmd()  {}

// loadCmd is a command that loads sequences of sample values for specific
// metrics into the storage.
type loadCmd struct {
        // gap is the time distance between two consecutive values of a series;
        // set advances the sample timestamp by gap for every value.
        gap       time.Duration
        // metrics maps a label-set hash to the label set itself.
        metrics   map[uint64]labels.Labels
        // defs maps a label-set hash to the samples defined for that series.
        defs      map[uint64][]promql.Sample
        // exemplars is initialised empty by newLoadCmd; it is not read or
        // written by the methods visible in this part of the file.
        exemplars map[uint64][]exemplar.Exemplar
        // withNHCB makes append additionally call appendCustomHistogram, which
        // converts classic histogram series into native histograms with custom
        // bounds.
        withNHCB  bool
}

// newLoadCmd returns a loadCmd for the given sample gap with all of its maps
// allocated and empty. withNHCB is stored unchanged on the command.
func newLoadCmd(gap time.Duration, withNHCB bool) *loadCmd {
        cmd := &loadCmd{gap: gap, withNHCB: withNHCB}
        cmd.metrics = make(map[uint64]labels.Labels)
        cmd.defs = make(map[uint64][]promql.Sample)
        cmd.exemplars = make(map[uint64][]exemplar.Exemplar)
        return cmd
}

// String returns the constant command name "load"; the receiver's contents
// are not inspected.
func (loadCmd) String() string {
        return "load"
}

// set stores the samples for metric m, replacing any previous definition with
// the same label hash. The first value is placed at testStartTime and every
// following value cmd.gap later; omitted values still advance the clock but
// produce no sample.
func (cmd *loadCmd) set(m labels.Labels, vals ...parser.SequenceValue) {
        samples := make([]promql.Sample, 0, len(vals))
        for idx, at := 0, testStartTime; idx < len(vals); idx, at = idx+1, at.Add(cmd.gap) {
                v := vals[idx]
                if v.Omitted {
                        continue
                }
                samples = append(samples, promql.Sample{
                        // Sample timestamps are in milliseconds.
                        T: at.UnixNano() / int64(time.Millisecond),
                        F: v.Value,
                        H: v.Histogram,
                })
        }

        key := m.Hash()
        cmd.metrics[key] = m
        cmd.defs[key] = samples
}

// append writes every defined sample of every series to the appender and
// returns the first error encountered. When the command was created with
// withNHCB, the classic-histogram conversion in appendCustomHistogram runs
// afterwards and its result is returned.
func (cmd *loadCmd) append(a storage.Appender) error {
        for hash, samples := range cmd.defs {
                series := cmd.metrics[hash]
                for i := range samples {
                        if err := appendSample(a, samples[i], series); err != nil {
                                return err
                        }
                }
        }
        if !cmd.withNHCB {
                return nil
        }
        return cmd.appendCustomHistogram(a)
}

// getHistogramMetricBase derives the label set shared by all series of one
// classic histogram: the metric name with suffix trimmed and the bucket label
// removed. It returns that label set together with its hash.
func getHistogramMetricBase(m labels.Labels, suffix string) (labels.Labels, uint64) {
        baseName := strings.TrimSuffix(m.Get(labels.MetricName), suffix)
        builder := labels.NewBuilder(m).
                Set(labels.MetricName, baseName).
                Del(labels.BucketLabel)
        base := builder.Labels()
        return base, base.Hash()
}

// tempHistogramWrapper collects the data of one classic histogram while its
// series are being collated by appendCustomHistogram.
type tempHistogramWrapper struct {
        // metric is the base label set produced by getHistogramMetricBase.
        metric        labels.Labels
        // upperBounds holds the parsed bucket-label values of the bucket series,
        // in the order encountered (unsorted, possibly with duplicates).
        upperBounds   []float64
        // histogramByTs maps a sample timestamp to the values gathered for it.
        histogramByTs map[int64]tempHistogram
}

// newTempHistogramWrapper returns a wrapper with an empty, non-nil bounds
// slice and an allocated timestamp map; the metric is left at its zero value.
func newTempHistogramWrapper() tempHistogramWrapper {
        var w tempHistogramWrapper
        w.upperBounds = []float64{}
        w.histogramByTs = make(map[int64]tempHistogram)
        return w
}

// tempHistogram holds the classic histogram values gathered for a single
// timestamp.
type tempHistogram struct {
        // bucketCounts maps a bucket upper bound to the value of that bucket
        // series, as taken directly from the sample.
        bucketCounts map[float64]float64
        // count is the value of the "_count" series.
        count        float64
        // sum is the value of the "_sum" series.
        sum          float64
}

// newTempHistogram returns a tempHistogram with an allocated bucket map and
// zero count and sum.
func newTempHistogram() tempHistogram {
        return tempHistogram{bucketCounts: make(map[float64]float64)}
}

// processClassicHistogramSeries folds one classic histogram series (bucket,
// count or sum) into histogramMap.
//
// The series is attributed to the histogram identified by its base label set
// (metric name without suffix, no bucket label). updateHistogramWrapper, if
// non-nil, is called once for the wrapper; updateHistogram is called for every
// float sample with the per-timestamp histogram and the sample value. Samples
// that already carry a native histogram are skipped.
func processClassicHistogramSeries(m labels.Labels, suffix string, histogramMap map[uint64]tempHistogramWrapper, smpls []promql.Sample, updateHistogramWrapper func(*tempHistogramWrapper), updateHistogram func(*tempHistogram, float64)) {
        base, baseHash := getHistogramMetricBase(m, suffix)

        wrapper, found := histogramMap[baseHash]
        if !found {
                wrapper = newTempHistogramWrapper()
        }
        wrapper.metric = base
        if updateHistogramWrapper != nil {
                updateHistogramWrapper(&wrapper)
        }

        for i := range smpls {
                sample := smpls[i]
                if sample.H != nil {
                        continue
                }
                perTs, known := wrapper.histogramByTs[sample.T]
                if !known {
                        perTs = newTempHistogram()
                }
                updateHistogram(&perTs, sample.F)
                wrapper.histogramByTs[sample.T] = perTs
        }

        // The wrapper is a value type, so it has to be written back.
        histogramMap[baseHash] = wrapper
}

// processUpperBoundsAndCreateBaseHistogram prepares the bucket layout for a
// converted classic histogram.
//
// upperBounds0 is sorted in place (the caller's slice is reordered) and then
// deduplicated into a new slice. Because the deduplication starts from -Inf, a
// -Inf bound is dropped as well. The returned histogram uses the custom
// buckets schema with a single positive span covering every bound and
// zero-valued buckets; a trailing +Inf bound gets a bucket but is not listed
// in CustomValues.
//
// An empty upperBounds0 (e.g. a "_count" or "_sum" series without any bucket
// series) yields an empty bound list and a base histogram without buckets
// instead of panicking on an out-of-range index.
func processUpperBoundsAndCreateBaseHistogram(upperBounds0 []float64) ([]float64, *histogram.FloatHistogram) {
        sort.Float64s(upperBounds0)
        upperBounds := make([]float64, 0, len(upperBounds0))
        prevLE := math.Inf(-1)
        for _, le := range upperBounds0 {
                if le != prevLE { // deduplicate
                        upperBounds = append(upperBounds, le)
                        prevLE = le
                }
        }

        // The +Inf bucket is implicit in the custom bounds, so strip it there.
        // Guard the index so that an empty bound list is handled gracefully.
        customBounds := upperBounds
        if n := len(upperBounds); n > 0 && upperBounds[n-1] == math.Inf(1) {
                customBounds = upperBounds[:n-1]
        }

        return upperBounds, &histogram.FloatHistogram{
                Count:  0,
                Sum:    0,
                Schema: histogram.CustomBucketsSchema,
                PositiveSpans: []histogram.Span{
                        {Offset: 0, Length: uint32(len(upperBounds))},
                },
                PositiveBuckets: make([]float64, len(upperBounds)),
                CustomValues:    customBounds,
        }
}

// appendCustomHistogram converts the classic histograms found among the
// defined series into native histograms with custom bounds and appends them
// to the storage.
//
// Series are recognised by name: "_bucket" with a parseable, non-NaN bucket
// label, "_count" and "_sum". Their samples are collated per base metric and
// timestamp, turned into one float histogram per timestamp, validated and
// appended in timestamp order. The first validation or append error is
// returned.
func (cmd *loadCmd) appendCustomHistogram(a storage.Appender) error {
        collated := map[uint64]tempHistogramWrapper{}

        // Phase 1: gather bucket, count and sum values by base metric and
        // timestamp.
        for hash, smpls := range cmd.defs {
                series := cmd.metrics[hash]
                name := series.Get(labels.MetricName)
                switch {
                case strings.HasSuffix(name, "_bucket") && series.Has(labels.BucketLabel):
                        le, err := strconv.ParseFloat(series.Get(labels.BucketLabel), 64)
                        if err != nil || math.IsNaN(le) {
                                // Not a usable bucket boundary: ignore the series.
                                continue
                        }
                        addBound := func(w *tempHistogramWrapper) {
                                w.upperBounds = append(w.upperBounds, le)
                        }
                        setBucket := func(th *tempHistogram, f float64) {
                                th.bucketCounts[le] = f
                        }
                        processClassicHistogramSeries(series, "_bucket", collated, smpls, addBound, setBucket)
                case strings.HasSuffix(name, "_count"):
                        setCount := func(th *tempHistogram, f float64) {
                                th.count = f
                        }
                        processClassicHistogramSeries(series, "_count", collated, smpls, nil, setCount)
                case strings.HasSuffix(name, "_sum"):
                        setSum := func(th *tempHistogram, f float64) {
                                th.sum = f
                        }
                        processClassicHistogramSeries(series, "_sum", collated, smpls, nil, setSum)
                }
        }

        // Phase 2: build one native histogram per timestamp and append them.
        for _, wrapper := range collated {
                upperBounds, fhBase := processUpperBoundsAndCreateBaseHistogram(wrapper.upperBounds)
                samples := make([]promql.Sample, 0, len(wrapper.histogramByTs))
                for ts, th := range wrapper.histogramByTs {
                        fh := fhBase.Copy()
                        var prev, total float64
                        for i, le := range upperBounds {
                                // A bound without a value at this timestamp reads as 0.
                                curr := th.bucketCounts[le]
                                // Classic buckets are cumulative; native buckets are not.
                                delta := curr - prev
                                fh.PositiveBuckets[i] = delta
                                total += delta
                                prev = curr
                        }
                        fh.Sum = th.sum
                        // A non-zero "_count" value overrides the total derived
                        // from the buckets.
                        if th.count != 0 {
                                total = th.count
                        }
                        fh.Count = total

                        sample := promql.Sample{T: ts, H: fh.Compact(0)}
                        if err := sample.H.Validate(); err != nil {
                                return err
                        }
                        samples = append(samples, sample)
                }
                // Map iteration order is random; append in timestamp order.
                sort.Slice(samples, func(i, j int) bool { return samples[i].T < samples[j].T })
                for i := range samples {
                        if err := appendSample(a, samples[i], wrapper.metric); err != nil {
                                return err
                        }
                }
        }
        return nil
}

// appendSample appends s for series m to the appender. A sample carrying a
// histogram goes through AppendHistogram as a float histogram; any other
// sample is appended as a plain float value. The appender's error is returned
// unchanged.
func appendSample(a storage.Appender, s promql.Sample, m labels.Labels) error {
        if s.H == nil {
                _, err := a.Append(0, m, s.T, s.F)
                return err
        }
        _, err := a.AppendHistogram(0, m, s.T, nil, s.H)
        return err
}

// evalCmd is a command that evaluates an expression for the given time (range)
// and expects a specific result.
type evalCmd struct {
        // expr is the query text; start, end and step are passed to the engine
        // when the query is created (instant queries use start only).
        expr  string
        start time.Time
        end   time.Time
        step  time.Duration
        // line is the script line reported in error messages.
        line  int

        isRange             bool // if false, instant query
        // fail: the query must return an error. warn: the query must produce
        // warnings. ordered: vector results must match the expected positions.
        fail, warn, ordered bool
        // expectedFailMessage, if non-empty, must equal the error text exactly.
        expectedFailMessage string
        // expectedFailRegexp, if non-nil, must match the error text.
        expectedFailRegexp  *regexp.Regexp

        // metrics maps a label-set hash to the expected series' labels.
        metrics  map[uint64]labels.Labels
        // expected maps a label-set hash to the expected values; expect stores
        // its label-less expectation under key 0.
        expected map[uint64]entry
}

// entry is one expected result of an eval command.
type entry struct {
        // pos is the expected position of the series; compareResult checks it
        // against the 1-based result index when the command is ordered.
        pos  int
        // vals are the expected values of the series.
        vals []parser.SequenceValue
}

// String renders the entry as "<pos>: <vals>".
func (e entry) String() string {
        position, values := e.pos, e.vals
        return fmt.Sprintf("%d: %s", position, values)
}

// newInstantEvalCmd returns an eval command for an instant query of expr at
// time start, defined on the given script line, with empty expectation maps.
func newInstantEvalCmd(expr string, start time.Time, line int) *evalCmd {
        cmd := &evalCmd{expr: expr, start: start, line: line}
        cmd.metrics = make(map[uint64]labels.Labels)
        cmd.expected = make(map[uint64]entry)
        return cmd
}

// newRangeEvalCmd returns an eval command for a range query of expr from start
// to end with the given step, defined on the given script line, with empty
// expectation maps. The command is marked as a range evaluation.
func newRangeEvalCmd(expr string, start, end time.Time, step time.Duration, line int) *evalCmd {
        cmd := &evalCmd{expr: expr, line: line, isRange: true}
        cmd.start, cmd.end, cmd.step = start, end, step
        cmd.metrics = make(map[uint64]labels.Labels)
        cmd.expected = make(map[uint64]entry)
        return cmd
}

// String returns the constant command name "eval"; the receiver is not
// dereferenced.
func (*evalCmd) String() string {
        return "eval"
}

// expect records a label-less expectation (position and values) for the
// query. It is always stored under key 0, so a later call replaces an earlier
// one.
func (ev *evalCmd) expect(pos int, vals ...parser.SequenceValue) {
        const labelLessKey uint64 = 0
        ev.expected[labelLessKey] = entry{vals: vals, pos: pos}
}

// expectMetric records that the query result must contain series m with the
// given values at position pos. Both the labels and the expectation are keyed
// by the hash of m, so repeating a metric replaces its earlier expectation.
func (ev *evalCmd) expectMetric(pos int, m labels.Labels, vals ...parser.SequenceValue) {
        key := m.Hash()
        ev.expected[key] = entry{vals: vals, pos: pos}
        ev.metrics[key] = m
}

// compareResult compares the result value with the defined expectation and
// returns a descriptive error on the first mismatch.
//
//   - Matrix: ordered expectations are rejected, the series must be sorted by
//     labels, every returned series must be expected, its float and histogram
//     points must match the expected timestamps and values, and every expected
//     series must be present.
//   - Vector: every returned sample must be expected (at the expected position
//     when the command is ordered) with a matching float or histogram value,
//     and every expected series must be present.
//   - Scalar: exactly one label-less expectation (as stored by expect) must
//     exist and its value must match.
//
// Float values are compared with almost.Equal using defaultEpsilon. Any other
// result type causes a panic.
func (ev *evalCmd) compareResult(result parser.Value) error {
        switch val := result.(type) {
        case promql.Matrix:
                if ev.ordered {
                        return fmt.Errorf("expected ordered result, but query returned a matrix")
                }

                if err := assertMatrixSorted(val); err != nil {
                        return err
                }

                seen := map[uint64]bool{}
                for _, s := range val {
                        hash := s.Metric.Hash()
                        if _, ok := ev.metrics[hash]; !ok {
                                return fmt.Errorf("unexpected metric %s in result, has %s", s.Metric, formatSeriesResult(s))
                        }
                        seen[hash] = true
                        exp := ev.expected[hash]

                        var expectedFloats []promql.FPoint
                        var expectedHistograms []promql.HPoint

                        // The i-th expected value belongs to the i-th step of the
                        // range; omitted values consume a step but expect no point.
                        for i, e := range exp.vals {
                                ts := ev.start.Add(time.Duration(i) * ev.step)

                                if ts.After(ev.end) {
                                        return fmt.Errorf("expected %v points for %s, but query time range cannot return this many points", len(exp.vals), ev.metrics[hash])
                                }

                                t := ts.UnixNano() / int64(time.Millisecond/time.Nanosecond)

                                if e.Histogram != nil {
                                        expectedHistograms = append(expectedHistograms, promql.HPoint{T: t, H: e.Histogram})
                                } else if !e.Omitted {
                                        expectedFloats = append(expectedFloats, promql.FPoint{T: t, F: e.Value})
                                }
                        }

                        if len(expectedFloats) != len(s.Floats) || len(expectedHistograms) != len(s.Histograms) {
                                return fmt.Errorf("expected %v float points and %v histogram points for %s, but got %s", len(expectedFloats), len(expectedHistograms), ev.metrics[hash], formatSeriesResult(s))
                        }

                        for i, expected := range expectedFloats {
                                actual := s.Floats[i]

                                if expected.T != actual.T {
                                        return fmt.Errorf("expected float value at index %v for %s to have timestamp %v, but it had timestamp %v (result has %s)", i, ev.metrics[hash], expected.T, actual.T, formatSeriesResult(s))
                                }

                                if !almost.Equal(actual.F, expected.F, defaultEpsilon) {
                                        return fmt.Errorf("expected float value at index %v (t=%v) for %s to be %v, but got %v (result has %s)", i, actual.T, ev.metrics[hash], expected.F, actual.F, formatSeriesResult(s))
                                }
                        }

                        for i, expected := range expectedHistograms {
                                actual := s.Histograms[i]

                                if expected.T != actual.T {
                                        return fmt.Errorf("expected histogram value at index %v for %s to have timestamp %v, but it had timestamp %v (result has %s)", i, ev.metrics[hash], expected.T, actual.T, formatSeriesResult(s))
                                }

                                if !actual.H.Equals(expected.H.Compact(0)) {
                                        return fmt.Errorf("expected histogram value at index %v (t=%v) for %s to be %v, but got %v (result has %s)", i, actual.T, ev.metrics[hash], expected.H, actual.H, formatSeriesResult(s))
                                }
                        }
                }

                for hash := range ev.expected {
                        if !seen[hash] {
                                return fmt.Errorf("expected metric %s not found", ev.metrics[hash])
                        }
                }

        case promql.Vector:
                seen := map[uint64]bool{}
                for pos, v := range val {
                        fp := v.Metric.Hash()
                        if _, ok := ev.metrics[fp]; !ok {
                                if v.H != nil {
                                        return fmt.Errorf("unexpected metric %s in result, has value %v", v.Metric, v.H)
                                }

                                return fmt.Errorf("unexpected metric %s in result, has value %v", v.Metric, v.F)
                        }
                        exp := ev.expected[fp]
                        // Expected positions are 1-based, result indices 0-based.
                        if ev.ordered && exp.pos != pos+1 {
                                return fmt.Errorf("expected metric %s with %v at position %d but was at %d", v.Metric, exp.vals, exp.pos, pos+1)
                        }
                        // NOTE(review): assumes every expectation holds at least
                        // one value — confirm against the series parser.
                        exp0 := exp.vals[0]
                        expH := exp0.Histogram
                        if expH == nil && v.H != nil {
                                return fmt.Errorf("expected float value %v for %s but got histogram %s", exp0, v.Metric, HistogramTestExpression(v.H))
                        }
                        if expH != nil && v.H == nil {
                                return fmt.Errorf("expected histogram %s for %s but got float value %v", HistogramTestExpression(expH), v.Metric, v.F)
                        }
                        if expH != nil && !expH.Compact(0).Equals(v.H) {
                                return fmt.Errorf("expected %v for %s but got %s", HistogramTestExpression(expH), v.Metric, HistogramTestExpression(v.H))
                        }
                        if !almost.Equal(exp0.Value, v.F, defaultEpsilon) {
                                return fmt.Errorf("expected %v for %s but got %v", exp0.Value, v.Metric, v.F)
                        }

                        seen[fp] = true
                }
                for fp, expVals := range ev.expected {
                        if !seen[fp] {
                                return fmt.Errorf("expected metric %s with %v not found", ev.metrics[fp], expVals)
                        }
                }

        case promql.Scalar:
                // A scalar expectation is the single label-less entry stored by
                // expect under key 0. If the only expectation is a series (keyed
                // by its label hash) there is nothing under key 0, which
                // previously caused an index-out-of-range panic.
                exp, ok := ev.expected[0]
                if len(ev.expected) != 1 || !ok || len(exp.vals) == 0 {
                        return fmt.Errorf("expected vector result, but got scalar %s", val.String())
                }
                exp0 := exp.vals[0]
                if exp0.Histogram != nil {
                        return fmt.Errorf("expected Histogram %v but got scalar %s", exp0.Histogram.TestExpression(), val.String())
                }
                if !almost.Equal(exp0.Value, val.V, defaultEpsilon) {
                        // Expected value first, actual value second.
                        return fmt.Errorf("expected Scalar %v but got %v", exp0.Value, val.V)
                }

        default:
                panic(fmt.Errorf("promql.Test.compareResult: unexpected result type %T", result))
        }
        return nil
}

// checkExpectedFailure verifies that the error returned by a query that was
// expected to fail matches the command's expectations: the exact message if
// expectedFailMessage is set, and the pattern if expectedFailRegexp is set. It
// returns nil when no particular error was expected or all expectations hold.
func (ev *evalCmd) checkExpectedFailure(actual error) error {
        if want := ev.expectedFailMessage; want != "" && want != actual.Error() {
                return fmt.Errorf("expected error %q evaluating query %q (line %d), but got: %s", want, ev.expr, ev.line, actual.Error())
        }

        if re := ev.expectedFailRegexp; re != nil && !re.MatchString(actual.Error()) {
                return fmt.Errorf("expected error matching pattern %q evaluating query %q (line %d), but got: %s", re.String(), ev.expr, ev.line, actual.Error())
        }

        // Either nothing specific was expected or the error matched.
        return nil
}

// formatSeriesResult describes the points of a series for use in error
// messages: the number of float and histogram points (with correct
// pluralisation) followed by the points themselves.
func formatSeriesResult(s promql.Series) string {
        // suffix returns the plural "s" for every count except exactly one.
        suffix := func(n int) string {
                if n == 1 {
                        return ""
                }
                return "s"
        }

        numFloats, numHistograms := len(s.Floats), len(s.Histograms)
        return fmt.Sprintf("%v float point%s %v and %v histogram point%s %v", numFloats, suffix(numFloats), s.Floats, numHistograms, suffix(numHistograms), s.Histograms)
}

// HistogramTestExpression returns h.TestExpression(), or the empty string when
// h is nil.
func HistogramTestExpression(h *histogram.FloatHistogram) string {
        if h == nil {
                return ""
        }
        return h.TestExpression()
}

// clearCmd is a command that wipes the test's storage state.
// It carries no data of its own.
type clearCmd struct{}

// String returns the constant command name "clear".
func (clearCmd) String() string {
        return "clear"
}

// atModifierTestCase is a single instant query to run: the expression text and
// the time at which to evaluate it.
type atModifierTestCase struct {
        // expr is the query text handed to the engine.
        expr     string
        // evalTime is the evaluation timestamp of the instant query.
        evalTime time.Time
}

// atModifierTestCases derives additional instant-query test cases from
// exprStr by pinning its selectors to evalTime with the @ modifier and then
// evaluating the rewritten expression at several other times.
//
// It returns an error if exprStr does not parse. It returns (nil, nil) when
// the expression calls a function listed in promql.AtModifierUnsafeFunctions.
func atModifierTestCases(exprStr string, evalTime time.Time) ([]atModifierTestCase, error) {
        expr, err := parser.ParseExpr(exprStr)
        if err != nil {
                return nil, err
        }
        ts := timestamp.FromTime(evalTime)

        // Pin every selector and subquery that has no @ timestamp yet to ts.
        // Nodes below a subquery that already has a timestamp are left alone,
        // and existing timestamps are never overwritten.
        callsUnsafeFunc := false
        parser.Inspect(expr, func(node parser.Node, path []parser.Node) error {
                if hasAtModifier(path) {
                        // A subquery with a timestamp is in the path, so nothing
                        // below it is changed.
                        return nil
                }
                switch n := node.(type) {
                case *parser.VectorSelector:
                        if n.Timestamp == nil {
                                n.Timestamp = makeInt64Pointer(ts)
                        }

                case *parser.MatrixSelector:
                        inner := n.VectorSelector.(*parser.VectorSelector)
                        if inner.Timestamp == nil {
                                inner.Timestamp = makeInt64Pointer(ts)
                        }

                case *parser.SubqueryExpr:
                        if n.Timestamp == nil {
                                n.Timestamp = makeInt64Pointer(ts)
                        }

                case *parser.Call:
                        if _, unsafe := promql.AtModifierUnsafeFunctions[n.Func.Name]; unsafe {
                                callsUnsafeFunc = true
                        }
                }
                return nil
        })

        if callsUnsafeFunc {
                // The result of such a function can vary with evaluation time
                // even though its arguments are step invariant: skip it.
                return nil, nil
        }

        pinned := expr.String() // With all the @ evalTime set.

        var evalTimes []int64
        if ts == 0 {
                evalTimes = []int64{-1000, -ts, 1000}
        } else {
                evalTimes = []int64{-10 * ts, 0, ts / 5, ts, 10 * ts}
        }

        cases := make([]atModifierTestCase, len(evalTimes))
        for i, et := range evalTimes {
                cases[i] = atModifierTestCase{expr: pinned, evalTime: timestamp.Time(et)}
        }
        return cases, nil
}

// hasAtModifier reports whether any node in path is a subquery expression
// that has its @ timestamp set.
func hasAtModifier(path []parser.Node) bool {
        for i := range path {
                sub, isSubquery := path[i].(*parser.SubqueryExpr)
                if isSubquery && sub.Timestamp != nil {
                        return true
                }
        }
        return false
}

// exec runs one test command: clear resets the storage, load appends and
// commits the defined samples (rolling back if appending fails), and eval
// runs the query and checks its result. An unknown command type panics.
func (t *test) exec(tc testCommand, engine promql.QueryEngine) error {
        switch cmd := tc.(type) {
        case *evalCmd:
                return t.execEval(cmd, engine)

        case *clearCmd:
                t.clear()
                return nil

        case *loadCmd:
                app := t.storage.Appender(t.context)
                if err := cmd.append(app); err != nil {
                        app.Rollback()
                        return err
                }
                return app.Commit()

        default:
                panic("promql.Test.exec: unknown test command type")
        }
}

// execEval dispatches an eval command to the range or the instant evaluation
// path depending on cmd.isRange.
func (t *test) execEval(cmd *evalCmd, engine promql.QueryEngine) error {
        if !cmd.isRange {
                return t.execInstantEval(cmd, engine)
        }
        return t.execRangeEval(cmd, engine)
}

// execRangeEval runs cmd as a range query and checks the outcome against the
// command's expectations.
//
// It fails if warnings are present but not expected (or expected but absent),
// if the query errors unexpectedly, if an expected error does not occur or
// does not match, or if the result differs from the expected values.
func (t *test) execRangeEval(cmd *evalCmd, engine promql.QueryEngine) error {
        q, err := engine.NewRangeQuery(t.context, t.storage, nil, cmd.expr, cmd.start, cmd.end, cmd.step)
        if err != nil {
                return fmt.Errorf("error creating range query for %q (line %d): %w", cmd.expr, cmd.line, err)
        }
        // Register the close right after creation so the query is released on
        // every return path, including the early error returns below.
        defer q.Close()

        res := q.Exec(t.context)
        countWarnings, _ := res.Warnings.CountWarningsAndInfo()
        if !cmd.warn && countWarnings > 0 {
                return fmt.Errorf("unexpected warnings evaluating query %q (line %d): %v", cmd.expr, cmd.line, res.Warnings)
        }
        if cmd.warn && countWarnings == 0 {
                return fmt.Errorf("expected warnings evaluating query %q (line %d) but got none", cmd.expr, cmd.line)
        }
        if res.Err != nil {
                if cmd.fail {
                        return cmd.checkExpectedFailure(res.Err)
                }

                return fmt.Errorf("error evaluating query %q (line %d): %w", cmd.expr, cmd.line, res.Err)
        }
        // res.Err is nil from here on.
        if cmd.fail {
                return fmt.Errorf("expected error evaluating query %q (line %d) but got none", cmd.expr, cmd.line)
        }

        if err := cmd.compareResult(res.Value); err != nil {
                return fmt.Errorf("error in %s %s (line %d): %w", cmd, cmd.expr, cmd.line, err)
        }

        return nil
}

// execInstantEval runs cmd as an instant query: first the original expression
// at cmd.start, then every additional case produced by atModifierTestCases.
// It stops at and returns the first error.
func (t *test) execInstantEval(cmd *evalCmd, engine promql.QueryEngine) error {
        extra, err := atModifierTestCases(cmd.expr, cmd.start)
        if err != nil {
                return err
        }

        queries := make([]atModifierTestCase, 0, len(extra)+1)
        queries = append(queries, atModifierTestCase{expr: cmd.expr, evalTime: cmd.start})
        queries = append(queries, extra...)

        for i := range queries {
                if err := t.runInstantQuery(queries[i], cmd, engine); err != nil {
                        return err
                }
        }
        return nil
}

// runInstantQuery evaluates iq as an instant query, checks warnings, errors
// and the result against cmd, and then re-evaluates the same expression as a
// three-step range query around iq.evalTime to verify that the middle step
// yields the same result.
//
// The range-mode comparison is skipped for ordered expectations, because the
// range-query result is required to be sorted by labels.
func (t *test) runInstantQuery(iq atModifierTestCase, cmd *evalCmd, engine promql.QueryEngine) error {
        q, err := engine.NewInstantQuery(t.context, t.storage, nil, iq.expr, iq.evalTime)
        if err != nil {
                return fmt.Errorf("error creating instant query for %q (line %d): %w", cmd.expr, cmd.line, err)
        }
        defer q.Close()
        res := q.Exec(t.context)
        countWarnings, _ := res.Warnings.CountWarningsAndInfo()
        if !cmd.warn && countWarnings > 0 {
                return fmt.Errorf("unexpected warnings evaluating query %q (line %d): %v", iq.expr, cmd.line, res.Warnings)
        }
        if cmd.warn && countWarnings == 0 {
                return fmt.Errorf("expected warnings evaluating query %q (line %d) but got none", iq.expr, cmd.line)
        }
        if res.Err != nil {
                if cmd.fail {
                        return cmd.checkExpectedFailure(res.Err)
                }
                return fmt.Errorf("error evaluating query %q (line %d): %w", iq.expr, cmd.line, res.Err)
        }
        // res.Err is nil from here on.
        if cmd.fail {
                return fmt.Errorf("expected error evaluating query %q (line %d) but got none", iq.expr, cmd.line)
        }
        err = cmd.compareResult(res.Value)
        if err != nil {
                return fmt.Errorf("error in %s %s (line %d): %w", cmd, iq.expr, cmd.line, err)
        }

        // Check query returns same result in range mode,
        // by checking against the middle step.
        rangeQ, err := engine.NewRangeQuery(t.context, t.storage, nil, iq.expr, iq.evalTime.Add(-time.Minute), iq.evalTime.Add(time.Minute), time.Minute)
        if err != nil {
                return fmt.Errorf("error creating range query for %q (line %d): %w", cmd.expr, cmd.line, err)
        }
        // Close the range query on every return path, including the error
        // return right below.
        defer rangeQ.Close()
        rangeRes := rangeQ.Exec(t.context)
        if rangeRes.Err != nil {
                return fmt.Errorf("error evaluating query %q (line %d) in range mode: %w", iq.expr, cmd.line, rangeRes.Err)
        }
        if cmd.ordered {
                // Range queries are always sorted by labels, so skip this test case that expects results in a particular order.
                return nil
        }
        mat := rangeRes.Value.(promql.Matrix)
        if err := assertMatrixSorted(mat); err != nil {
                return err
        }

        // Extract the sample at the evaluation time from every series.
        vec := make(promql.Vector, 0, len(mat))
        for _, series := range mat {
                // We expect either Floats or Histograms.
                for _, point := range series.Floats {
                        if point.T == timeMilliseconds(iq.evalTime) {
                                vec = append(vec, promql.Sample{Metric: series.Metric, T: point.T, F: point.F})
                                break
                        }
                }
                for _, point := range series.Histograms {
                        if point.T == timeMilliseconds(iq.evalTime) {
                                vec = append(vec, promql.Sample{Metric: series.Metric, T: point.T, H: point.H})
                                break
                        }
                }
        }
        if _, ok := res.Value.(promql.Scalar); ok {
                // NOTE(review): assumes the range result holds a point at the
                // evaluation time whenever the instant result was a scalar.
                err = cmd.compareResult(promql.Scalar{V: vec[0].F})
        } else {
                err = cmd.compareResult(vec)
        }
        if err != nil {
                return fmt.Errorf("error in %s %s (line %d) range mode: %w", cmd, iq.expr, cmd.line, err)
        }
        return nil
}

// assertMatrixSorted returns an error describing the first adjacent pair of
// series whose labels are out of order according to labels.Compare. A matrix
// with fewer than two series is always considered sorted.
func assertMatrixSorted(m promql.Matrix) error {
        for next := 1; next < len(m); next++ {
                prev := next - 1
                if labels.Compare(m[prev].Metric, m[next].Metric) > 0 {
                        return fmt.Errorf("matrix results should always be sorted by labels, but matrix is not sorted: series at index %v with labels %s sorts before series at index %v with labels %s", next, m[next].Metric, prev, m[prev].Metric)
                }
        }
        return nil
}

// clear discards the current test storage and context and replaces them with
// fresh ones. An existing storage is closed first (a close error fails the
// test via require.NoError) and an existing context is cancelled.
func (t *test) clear() {
        if old := t.storage; old != nil {
                closeErr := old.Close()
                require.NoError(t.T, closeErr, "Unexpected error while closing test storage.")
        }
        if cancel := t.cancelCtx; cancel != nil {
                cancel()
        }

        t.storage = teststorage.New(t)
        t.context, t.cancelCtx = context.WithCancel(context.Background())
}

// parseNumber converts s to a float64. It first tries to read s as an integer
// (base inferred from the prefix, as strconv.ParseInt does with base 0) and
// falls back to a floating point parse when that fails. If neither parse
// succeeds, the float parse error is returned wrapped.
func parseNumber(s string) (float64, error) {
        if n, err := strconv.ParseInt(s, 0, 64); err == nil {
                return float64(n), nil
        }

        f, err := strconv.ParseFloat(s, 64)
        if err != nil {
                return 0, fmt.Errorf("error parsing number: %w", err)
        }
        return f, nil
}

// LazyLoader lazily loads samples into storage.
// This is specifically implemented for unit testing of rules.
type LazyLoader struct {
        // loadCmd is the parsed "load" command; appendTill consumes its
        // sample definitions as they are written to storage.
        loadCmd *loadCmd

        // storage is the backing test storage, (re)created by clear.
        storage          storage.Storage
        // SubqueryInterval is handed to the query engine (converted to
        // milliseconds) as the default subquery step.
        SubqueryInterval time.Duration

        // queryEngine, context and cancelCtx are (re)initialized by clear.
        queryEngine *promql.Engine
        context     context.Context
        cancelCtx   context.CancelFunc

        // opts holds the options supplied to NewLazyLoader.
        opts LazyLoaderOpts
}

// LazyLoaderOpts are options for the lazy loader. The values are forwarded
// to the identically named fields of the query engine options.
type LazyLoaderOpts struct {
        // Both of these must be set to true for regular PromQL (as of
        // Prometheus v2.33). They can still be disabled here for legacy and
        // other uses.
        EnableAtModifier, EnableNegativeOffset bool
}

// NewLazyLoader returns an initialized empty LazyLoader.
//
// input must contain a "load" command; it is parsed immediately. The storage,
// query engine and context are then set up. On any failure a nil loader is
// returned together with the error, so callers never receive a partially
// initialized LazyLoader.
func NewLazyLoader(input string, opts LazyLoaderOpts) (*LazyLoader, error) {
        ll := &LazyLoader{
                opts: opts,
        }
        if err := ll.parse(input); err != nil {
                return nil, err
        }
        if err := ll.clear(); err != nil {
                return nil, err
        }
        return ll, nil
}

// parse reads input and stores the first "load" command it finds in
// ll.loadCmd. Empty lines are skipped; the first non-empty line must start
// with "load" (case-insensitively), otherwise an error is raised for that
// line. An error is also returned when no command is present at all.
func (ll *LazyLoader) parse(input string) error {
        lines := getLines(input)
        for idx, line := range lines {
                if len(line) == 0 {
                        continue
                }

                // Only the 'load' command is accepted.
                keyword := strings.ToLower(patSpace.Split(line, 2)[0])
                if !strings.HasPrefix(keyword, "load") {
                        return raise(idx, "invalid command %q", line)
                }

                _, cmd, err := parseLoad(lines, idx)
                if err != nil {
                        return err
                }
                ll.loadCmd = cmd
                return nil
        }
        return errors.New("no \"load\" command found")
}

// clear replaces the loader's storage, query engine and context with fresh
// ones. An existing storage is closed first and an existing context is
// cancelled. It returns an error if closing the old storage or creating the
// new one fails.
func (ll *LazyLoader) clear() error {
        if ll.storage != nil {
                if err := ll.storage.Close(); err != nil {
                        return fmt.Errorf("closing test storage: %w", err)
                }
        }
        if ll.cancelCtx != nil {
                ll.cancelCtx()
        }

        var err error
        if ll.storage, err = teststorage.NewWithError(); err != nil {
                return err
        }

        // The subquery step is read from the loader at query time, so later
        // changes to SubqueryInterval are picked up.
        subqueryStep := func(int64) int64 {
                return durationMilliseconds(ll.SubqueryInterval)
        }
        ll.queryEngine = promql.NewEngine(promql.EngineOpts{
                Logger:                   nil,
                Reg:                      nil,
                MaxSamples:               10000,
                Timeout:                  100 * time.Second,
                NoStepSubqueryIntervalFn: subqueryStep,
                EnableAtModifier:         ll.opts.EnableAtModifier,
                EnableNegativeOffset:     ll.opts.EnableNegativeOffset,
        })

        ctx, cancel := context.WithCancel(context.Background())
        ll.context, ll.cancelCtx = ctx, cancel
        return nil
}

// appendTill writes every not-yet-loaded sample with a timestamp less than or
// equal to ts (in milliseconds) to the storage and commits them. Samples that
// were written are dropped from the pending definitions so that they are not
// appended again by a later call.
func (ll *LazyLoader) appendTill(ts int64) error {
        app := ll.storage.Appender(ll.Context())
        for h, smpls := range ll.loadCmd.defs {
                m := ll.loadCmd.metrics[h]

                // Append the leading run of samples that are due.
                consumed := 0
                for consumed < len(smpls) && smpls[consumed].T <= ts {
                        if err := appendSample(app, smpls[consumed], m); err != nil {
                                return err
                        }
                        consumed++
                }

                switch {
                case len(smpls) == 0:
                        // Nothing was pending for this series; leave it untouched.
                case consumed == len(smpls):
                        // Every pending sample has been written.
                        ll.loadCmd.defs[h] = nil
                default:
                        // Keep only the samples that are still in the future.
                        ll.loadCmd.defs[h] = smpls[consumed:]
                }
        }
        return app.Commit()
}

// WithSamplesTill loads all samples up to and including ts into storage and
// then invokes fn with the error (nil on success) produced by that load.
func (ll *LazyLoader) WithSamplesTill(ts time.Time, fn func(error)) {
        epoch := time.Unix(0, 0).UTC()
        sinceEpochMs := int64(ts.Sub(epoch) / time.Millisecond)

        err := ll.appendTill(sinceEpochMs)
        fn(err)
}

// QueryEngine returns the LazyLoader's query engine, i.e. the engine most
// recently created by clear.
func (ll *LazyLoader) QueryEngine() *promql.Engine {
        return ll.queryEngine
}

// Queryable allows querying the LazyLoader's data. It returns the loader's
// underlying storage.
// Note: only the samples till the max timestamp used
// in `WithSamplesTill` can be queried.
func (ll *LazyLoader) Queryable() storage.Queryable {
        return ll.storage
}

// Context returns the LazyLoader's context. The context is cancelled when
// the loader is cleared or closed.
func (ll *LazyLoader) Context() context.Context {
        return ll.context
}

// Storage returns the LazyLoader's storage, the same value that Queryable
// exposes through the narrower storage.Queryable interface.
func (ll *LazyLoader) Storage() storage.Storage {
        return ll.storage
}

// Close closes resources associated with the LazyLoader: it cancels the
// loader's context and closes the storage, returning the storage's close
// error.
// NOTE(review): both cancelCtx and storage are dereferenced unconditionally,
// so this presumably expects a loader obtained from NewLazyLoader — confirm.
func (ll *LazyLoader) Close() error {
        ll.cancelCtx()
        return ll.storage.Close()
}

// makeInt64Pointer returns a pointer to a freshly allocated int64 holding val.
func makeInt64Pointer(val int64) *int64 {
        boxed := val
        return &boxed
}

func timeMilliseconds(t time.Time) int64 {
        return t.UnixNano() / int64(time.Millisecond/time.Nanosecond)
}

func durationMilliseconds(d time.Duration) int64 {
        return int64(d / (time.Millisecond / time.Nanosecond))
}

// Copyright 2015 The Prometheus Authors
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package promql

import (
        "math"
        "slices"
        "sort"

        "github.com/prometheus/prometheus/model/histogram"
        "github.com/prometheus/prometheus/model/labels"
        "github.com/prometheus/prometheus/util/almost"
)

// smallDeltaTolerance is the threshold for relative deltas between classic
// histogram buckets that will be ignored by the histogram_quantile function
// because they are most likely artifacts of floating point precision issues.
// Testing on 2 sets of real data with bugs arising from small deltas,
// the safe ranges were from:
// - 1e-05 to 1e-15
// - 1e-06 to 1e-15
// Anything to the left of that would cause non-query-sharded data to have
// small deltas ignored (unnecessary and we should avoid this), and anything
// to the right of that would cause query-sharded data to not have its small
// deltas ignored (so the problem won't be fixed).
// For context, query sharding triggers these float precision errors in Mimir.
// To illustrate, with a relative deviation of 1e-12, we need to have 1e12
// observations in the bucket so that the change of one observation is small
// enough to get ignored. With the usual observation rate of even very busy
// services, this will hardly be reached in timeframes that matter for
// monitoring.
const smallDeltaTolerance = 1e-12

// Helpers to calculate quantiles.

// excludedLabels are the labels to exclude from signature calculation for
// quantiles: the metric name label and the bucket label.
var excludedLabels = []string{
        labels.MetricName,
        labels.BucketLabel,
}

// bucket is a single classic histogram bucket as consumed by bucketQuantile.
type bucket struct {
        // upperBound is the bucket's upper boundary; bucketQuantile requires
        // the highest bucket to have an upper bound of +Inf.
        upperBound float64
        // count is the bucket's count; bucketQuantile treats the count of the
        // highest bucket as the total number of observations.
        count      float64
}

// buckets is a list of classic histogram buckets. bucketQuantile sorts it in
// place by upperBound.
// NOTE(review): this comment used to say "buckets implements
// sort.Interface", but no Len/Less/Swap methods are visible in this file and
// sorting is done via slices.SortFunc — confirm and drop the claim if stale.
type buckets []bucket

// metricWithBuckets pairs a label set with a list of classic histogram
// buckets.
// NOTE(review): presumably the buckets are those collected for that metric
// when evaluating histogram_quantile — confirm against the caller.
type metricWithBuckets struct {
        metric  labels.Labels
        buckets buckets
}

// bucketQuantile calculates the quantile 'q' based on the given buckets. The
// buckets will be sorted by upperBound by this function (i.e. no sorting
// needed before calling this function). The quantile value is interpolated
// assuming a linear distribution within a bucket. However, if the quantile
// falls into the highest bucket, the upper bound of the 2nd highest bucket is
// returned. A natural lower bound of 0 is assumed if the upper bound of the
// lowest bucket is greater 0. In that case, interpolation in the lowest bucket
// happens linearly between 0 and the upper bound of the lowest bucket.
// However, if the lowest bucket has an upper bound less or equal 0, this upper
// bound is returned if the quantile falls into the lowest bucket.
//
// There are a number of special cases (once we have a way to report errors
// happening during evaluations of AST functions, we should report those
// explicitly):
//
// If 'buckets' has 0 observations, NaN is returned.
//
// If 'buckets' has fewer than 2 elements (including none at all), NaN is
// returned.
//
// If the highest bucket is not +Inf, NaN is returned.
//
// If q==NaN, NaN is returned.
//
// If q<0, -Inf is returned.
//
// If q>1, +Inf is returned.
//
// The checks on q take precedence over the checks on 'buckets'.
//
// We also return a bool to indicate if monotonicity needed to be forced,
// and another bool to indicate if small differences between buckets (that
// are likely artifacts of floating point precision issues) have been
// ignored.
func bucketQuantile(q float64, buckets buckets) (float64, bool, bool) {
        if math.IsNaN(q) {
                return math.NaN(), false, false
        }
        if q < 0 {
                return math.Inf(-1), false, false
        }
        if q > 1 {
                return math.Inf(+1), false, false
        }
        // Guard against an empty slice: the code below indexes the last and
        // the first bucket and would otherwise panic.
        if len(buckets) == 0 {
                return math.NaN(), false, false
        }
        slices.SortFunc(buckets, func(a, b bucket) int {
                // We don't expect the bucket boundary to be a NaN.
                if a.upperBound < b.upperBound {
                        return -1
                }
                if a.upperBound > b.upperBound {
                        return +1
                }
                return 0
        })
        if !math.IsInf(buckets[len(buckets)-1].upperBound, +1) {
                return math.NaN(), false, false
        }

        buckets = coalesceBuckets(buckets)
        forcedMonotonic, fixedPrecision := ensureMonotonicAndIgnoreSmallDeltas(buckets, smallDeltaTolerance)

        if len(buckets) < 2 {
                return math.NaN(), false, false
        }
        // The +Inf bucket counts every observation.
        observations := buckets[len(buckets)-1].count
        if observations == 0 {
                return math.NaN(), false, false
        }
        rank := q * observations
        // Find the first bucket (excluding the +Inf bucket) whose cumulative
        // count reaches the rank; b == len(buckets)-1 means none did.
        b := sort.Search(len(buckets)-1, func(i int) bool { return buckets[i].count >= rank })

        if b == len(buckets)-1 {
                return buckets[len(buckets)-2].upperBound, forcedMonotonic, fixedPrecision
        }
        if b == 0 && buckets[0].upperBound <= 0 {
                return buckets[0].upperBound, forcedMonotonic, fixedPrecision
        }
        var (
                bucketStart float64
                bucketEnd   = buckets[b].upperBound
                count       = buckets[b].count
        )
        if b > 0 {
                // Make count and rank relative to the selected bucket.
                bucketStart = buckets[b-1].upperBound
                count -= buckets[b-1].count
                rank -= buckets[b-1].count
        }
        return bucketStart + (bucketEnd-bucketStart)*(rank/count), forcedMonotonic, fixedPrecision
}

// histogramQuantile calculates the quantile 'q' based on the given histogram.
//
// The quantile value is interpolated assuming a linear distribution within a
// bucket.
// TODO(beorn7): Find an interpolation method that is a better fit for
// exponential buckets (and think about configurable interpolation).
//
// A natural lower bound of 0 is assumed if the histogram has only positive
// buckets. Likewise, a natural upper bound of 0 is assumed if the histogram has
// only negative buckets.
// TODO(beorn7): Come to terms if we want that.
//
// There are a number of special cases (once we have a way to report errors
// happening during evaluations of AST functions, we should report those
// explicitly):
//
// If the histogram has 0 observations, NaN is returned.
//
// If q<0, -Inf is returned.
//
// If q>1, +Inf is returned.
//
// If q is NaN, NaN is returned.
//
// The range checks on q are performed before the histogram is inspected, so
// they win over the 0-observations case. h is dereferenced unconditionally
// once q is within [0, 1] or NaN, so it must not be nil.
func histogramQuantile(q float64, h *histogram.FloatHistogram) float64 {
        if q < 0 {
                return math.Inf(-1)
        }
        if q > 1 {
                return math.Inf(+1)
        }

        if h.Count == 0 || math.IsNaN(q) {
                return math.NaN()
        }

        var (
                bucket histogram.Bucket[float64]
                count  float64
                it     histogram.BucketIterator[float64]
                rank   float64
        )

        // If there are NaN observations in the histogram (h.Sum is NaN), use the
        // forward iterator.
        // If q < 0.5, use the forward iterator.
        // If q >= 0.5, use the reverse iterator.
        if math.IsNaN(h.Sum) || q < 0.5 {
                it = h.AllBucketIterator()
                rank = q * h.Count
        } else {
                it = h.AllReverseBucketIterator()
                rank = (1 - q) * h.Count
        }

        // Accumulate bucket counts in iteration order until the rank is
        // reached; 'bucket' is then the bucket containing the quantile (or the
        // last non-empty bucket visited if the rank is never reached).
        for it.Next() {
                bucket = it.At()
                if bucket.Count == 0 {
                        continue
                }
                count += bucket.Count
                if count >= rank {
                        break
                }
        }
        if !h.UsesCustomBuckets() && bucket.Lower < 0 && bucket.Upper > 0 {
                switch {
                case len(h.NegativeBuckets) == 0 && len(h.PositiveBuckets) > 0:
                        // The result is in the zero bucket and the histogram has only
                        // positive buckets. So we consider 0 to be the lower bound.
                        bucket.Lower = 0
                case len(h.PositiveBuckets) == 0 && len(h.NegativeBuckets) > 0:
                        // The result is in the zero bucket and the histogram has only
                        // negative buckets. So we consider 0 to be the upper bound.
                        bucket.Upper = 0
                }
        } else if h.UsesCustomBuckets() {
                if bucket.Lower == math.Inf(-1) {
                        // first bucket, with lower bound -Inf
                        if bucket.Upper <= 0 {
                                return bucket.Upper
                        }
                        bucket.Lower = 0
                } else if bucket.Upper == math.Inf(1) {
                        // last bucket, with upper bound +Inf
                        return bucket.Lower
                }
        }
        // Due to numerical inaccuracies, we could end up with a higher count
        // than h.Count. Thus, make sure count is never higher than h.Count.
        if count > h.Count {
                count = h.Count
        }
        // We could have hit the highest bucket without even reaching the rank
        // (this should only happen if the histogram contains observations of
        // the value NaN), in which case we simply return the upper limit of the
        // highest explicit bucket.
        if count < rank {
                return bucket.Upper
        }

        // NaN observations increase h.Count but not the total number of
        // observations in the buckets. Therefore, we have to use the forward
        // iterator to find percentiles. We recognize histograms containing NaN
        // observations by checking if their h.Sum is NaN.
        // In both branches, rank becomes the position within the selected
        // bucket, measured from the bucket's lower bound.
        if math.IsNaN(h.Sum) || q < 0.5 {
                rank -= count - bucket.Count
        } else {
                rank = count - rank
        }

        // TODO(codesome): Use a better estimation than linear.
        return bucket.Lower + (bucket.Upper-bucket.Lower)*(rank/bucket.Count)
}

// histogramFraction calculates the fraction of observations between the
// provided lower and upper bounds, based on the provided histogram.
//
// histogramFraction is in a certain way the inverse of histogramQuantile.  If
// histogramQuantile(0.9, h) returns 123.4, then histogramFraction(-Inf, 123.4, h)
// returns 0.9.
//
// The same notes (and TODOs) with regard to interpolation and assumptions about
// the zero bucket boundaries apply as for histogramQuantile.
//
// Whether either boundary is inclusive or exclusive doesn’t actually matter as
// long as interpolation has to be performed anyway. In the case of a boundary
// coinciding with a bucket boundary, the inclusive or exclusive nature of the
// boundary determines the exact behavior of the threshold. With the current
// implementation, that means that lower is exclusive for positive values and
// inclusive for negative values, while upper is inclusive for positive values
// and exclusive for negative values.
//
// Special cases:
//
// If the histogram has 0 observations, NaN is returned.
//
// Use a lower bound of -Inf to get the fraction of all observations below the
// upper bound.
//
// Use an upper bound of +Inf to get the fraction of all observations above the
// lower bound.
//
// If lower or upper is NaN, NaN is returned.
//
// If lower >= upper and the histogram has at least 1 observation, zero is returned.
func histogramFraction(lower, upper float64, h *histogram.FloatHistogram) float64 {
        if h.Count == 0 || math.IsNaN(lower) || math.IsNaN(upper) {
                return math.NaN()
        }
        if lower >= upper {
                return 0
        }

        // rank is the running sum of the counts of all buckets visited so far;
        // lowerRank and upperRank are the (possibly interpolated) ranks at the
        // lower and upper bound, each recorded at most once.
        var (
                rank, lowerRank, upperRank float64
                lowerSet, upperSet         bool
                it                         = h.AllBucketIterator()
        )
        for it.Next() {
                b := it.At()
                if b.Lower < 0 && b.Upper > 0 {
                        switch {
                        case len(h.NegativeBuckets) == 0 && len(h.PositiveBuckets) > 0:
                                // This is the zero bucket and the histogram has only
                                // positive buckets. So we consider 0 to be the lower
                                // bound.
                                b.Lower = 0
                        case len(h.PositiveBuckets) == 0 && len(h.NegativeBuckets) > 0:
                                // This is in the zero bucket and the histogram has only
                                // negative buckets. So we consider 0 to be the upper
                                // bound.
                                b.Upper = 0
                        }
                }
                // The bound lies at or below the start of this bucket: the rank
                // accumulated before this bucket is the answer.
                if !lowerSet && b.Lower >= lower {
                        lowerRank = rank
                        lowerSet = true
                }
                if !upperSet && b.Lower >= upper {
                        upperRank = rank
                        upperSet = true
                }
                if lowerSet && upperSet {
                        break
                }
                // The bound lies strictly inside this bucket: interpolate
                // linearly between the bucket boundaries.
                if !lowerSet && b.Lower < lower && b.Upper > lower {
                        lowerRank = rank + b.Count*(lower-b.Lower)/(b.Upper-b.Lower)
                        lowerSet = true
                }
                if !upperSet && b.Lower < upper && b.Upper > upper {
                        upperRank = rank + b.Count*(upper-b.Lower)/(b.Upper-b.Lower)
                        upperSet = true
                }
                if lowerSet && upperSet {
                        break
                }
                rank += b.Count
        }
        // A bound that was never reached, or a rank exceeding the total count,
        // is clamped to h.Count.
        if !lowerSet || lowerRank > h.Count {
                lowerRank = h.Count
        }
        if !upperSet || upperRank > h.Count {
                upperRank = h.Count
        }

        return (upperRank - lowerRank) / h.Count
}

// coalesceBuckets merges buckets with the same upper bound by summing their
// counts.
//
// The input buckets must be sorted. The merge is done in place: the returned
// slice shares its backing array with the input and is at most as long. An
// empty input is returned unchanged.
func coalesceBuckets(buckets buckets) buckets {
        // Nothing to merge; also avoids indexing buckets[0] on empty input.
        if len(buckets) == 0 {
                return buckets
        }
        last := buckets[0]
        i := 0
        for _, b := range buckets[1:] {
                if b.upperBound == last.upperBound {
                        last.count += b.count
                } else {
                        // A new upper bound starts: flush the merged bucket.
                        buckets[i] = last
                        last = b
                        i++
                }
        }
        buckets[i] = last
        return buckets[:i+1]
}

// The assumption that bucket counts increase monotonically with increasing
// upperBound may be violated during:
//
//   - Circumstances where data is already inconsistent at the target's side.
//   - Ingestion via the remote write receiver that Prometheus implements.
//   - Optimisation of query execution where precision is sacrificed for other
//     benefits, not by Prometheus but by systems built on top of it.
//   - Circumstances where floating point precision errors accumulate.
//
// Monotonicity is usually guaranteed because if a bucket with upper bound
// u1 has count c1, then any bucket with a higher upper bound u > u1 must
// have counted all c1 observations and perhaps more, so that c >= c1.
//
// bucketQuantile depends on that monotonicity to do a binary search for the
// bucket with the φ-quantile count, so breaking the monotonicity
// guarantee causes bucketQuantile() to return undefined (nonsense) results.
//
// As a somewhat hacky solution, we first silently ignore any numerically
// insignificant (relative delta below the requested tolerance and likely to
// be from floating point precision errors) differences between successive
// buckets regardless of the direction. Then we calculate the "envelope" of
// the histogram buckets, essentially removing any decreases in the count
// between successive buckets.
//
// The buckets are corrected in place. We return a bool to indicate if this
// monotonicity was forced or not, and another bool to indicate if small
// deltas were ignored or not. For empty input, nothing is changed and both
// bools are false.
func ensureMonotonicAndIgnoreSmallDeltas(buckets buckets, tolerance float64) (bool, bool) {
        var forcedMonotonic, fixedPrecision bool
        // Nothing to correct; also avoids indexing buckets[0] on empty input.
        if len(buckets) == 0 {
                return forcedMonotonic, fixedPrecision
        }
        prev := buckets[0].count
        for i := 1; i < len(buckets); i++ {
                curr := buckets[i].count // Assumed always positive.
                if curr == prev {
                        // No correction needed if the counts are identical between buckets.
                        continue
                }
                if almost.Equal(prev, curr, tolerance) {
                        // Silently correct numerically insignificant differences from floating
                        // point precision errors, regardless of direction.
                        // Do not update the 'prev' value as we are ignoring the difference.
                        buckets[i].count = prev
                        fixedPrecision = true
                        continue
                }
                if curr < prev {
                        // Force monotonicity by removing any decreases regardless of magnitude.
                        // Do not update the 'prev' value as we are ignoring the decrease.
                        buckets[i].count = prev
                        forcedMonotonic = true
                        continue
                }
                prev = curr
        }
        return forcedMonotonic, fixedPrecision
}

// quantile calculates the given quantile of a vector of samples, using a
// weighted average of the two neighbouring samples when the quantile falls
// between them.
//
// The Vector will be sorted.
// If 'values' has zero elements, NaN is returned.
// If q==NaN, NaN is returned.
// If q<0, -Inf is returned.
// If q>1, +Inf is returned.
func quantile(q float64, values vectorByValueHeap) float64 {
        switch {
        case len(values) == 0 || math.IsNaN(q):
                return math.NaN()
        case q < 0:
                return math.Inf(-1)
        case q > 1:
                return math.Inf(+1)
        }
        sort.Sort(values)

        last := float64(len(values)) - 1
        // Fractional position of the quantile within the sorted samples.
        rank := q * last
        floor := math.Floor(rank)

        lo := math.Max(0, floor)
        hi := math.Min(last, lo+1)
        weight := rank - floor

        return values[int(lo)].F*(1-weight) + values[int(hi)].F*weight
}

// Copyright 2019 The Prometheus Authors
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package promql

import (
        "context"
        "encoding/json"
        "errors"
        "fmt"
        "io"
        "os"
        "path/filepath"
        "strings"
        "time"
        "unicode/utf8"

        "github.com/edsrzf/mmap-go"
        "github.com/go-kit/log"
        "github.com/go-kit/log/level"
)

// ActiveQueryTracker records the currently running queries in a memory-mapped
// file, using one fixed-size slot per query.
type ActiveQueryTracker struct {
        // mmapedFile is the memory-mapped content of the query log file.
        mmapedFile    []byte
        // getNextIndex hands out the byte offsets of free slots; Insert takes
        // an offset from it and Delete puts the offset back.
        getNextIndex  chan int
        logger        log.Logger
        // closer releases the mapping and the underlying file.
        closer        io.Closer
        // maxConcurrent is the number of slots, as passed to
        // NewActiveQueryTracker.
        maxConcurrent int
}

// Compile-time check that *ActiveQueryTracker implements io.Closer.
var _ io.Closer = &ActiveQueryTracker{}

// Entry is the JSON record written to the query log for one active query.
type Entry struct {
        // Query is the query string, trimmed by newJSONEntry to fit its slot.
        Query     string `json:"query"`
        // Timestamp is the Unix time in seconds at which the entry was created.
        Timestamp int64  `json:"timestamp_sec"`
}

const (
        // entrySize is the number of bytes reserved for each query slot in the
        // memory-mapped query log file.
        entrySize int = 1000
)

// parseBrokenJSON turns the raw content of a query log file into a JSON
// array string. All NUL bytes are dropped and the final remaining byte is
// replaced by a closing bracket. The boolean reports whether any queries were
// found; when it is false the returned string is "[]".
func parseBrokenJSON(brokenJSON []byte) (string, bool) {
        stripped := strings.ReplaceAll(string(brokenJSON), "\x00", "")
        if n := len(stripped); n > 0 {
                stripped = stripped[:n-1] + "]"
        }

        // A result of at most one byte means the file held nothing beyond the
        // opening '[' (implementation detail of how the file is written).
        if len(stripped) > 1 {
                return stripped, true
        }
        return "[]", false
}

// logUnfinishedQueries logs the queries recorded in the query log file
// filename, if that file exists. At most filesize bytes of the file are
// examined. Nothing is logged when the file cannot be stat-ed or contains no
// queries; failures to open or read the file are logged as errors.
func logUnfinishedQueries(filename string, filesize int, logger log.Logger) {
        if _, err := os.Stat(filename); err != nil {
                // No previous query log to report on.
                return
        }

        fd, err := os.Open(filename)
        if err != nil {
                level.Error(logger).Log("msg", "Failed to open query log file", "err", err)
                return
        }
        defer fd.Close()

        // Read until EOF or filesize bytes, whichever comes first. A single
        // Read call is not guaranteed to return all available bytes, and it
        // reports io.EOF for an empty file, which is not an error here.
        brokenJSON, err := io.ReadAll(io.LimitReader(fd, int64(filesize)))
        if err != nil {
                level.Error(logger).Log("msg", "Failed to read query log file", "err", err)
                return
        }

        queries, queriesExist := parseBrokenJSON(brokenJSON)
        if !queriesExist {
                return
        }
        level.Info(logger).Log("msg", "These queries didn't finish in prometheus' last run:", "queries", queries)
}

// mmapedFile bundles a memory mapping with the closer of the file it maps so
// that both can be released by a single Close call.
type mmapedFile struct {
        // f closes the underlying file.
        f io.Closer
        // m is the memory mapping of that file.
        m mmap.MMap
}

// Close unmaps the memory mapping and then closes the underlying file. Both
// steps are always attempted; if both fail, the errors are joined.
func (f *mmapedFile) Close() error {
        var unmapErr error
        if err := f.m.Unmap(); err != nil {
                unmapErr = fmt.Errorf("mmapedFile: unmapping: %w", err)
        }

        closeErr := f.f.Close()
        if closeErr == nil {
                return unmapErr
        }
        return errors.Join(fmt.Errorf("close mmapedFile.f: %w", closeErr), unmapErr)
}

// getMMapedFile creates (or truncates) the file filename, resizes it to
// filesize bytes and maps it into memory for reading and writing. It returns
// the mapped bytes together with a closer that unmaps the memory and closes
// the file. Each failure is logged and returned; the file is closed again if
// resizing or mapping fails.
func getMMapedFile(filename string, filesize int, logger log.Logger) ([]byte, io.Closer, error) {
        const openFlags = os.O_CREATE | os.O_RDWR | os.O_TRUNC

        file, err := os.OpenFile(filename, openFlags, 0o666)
        if err != nil {
                // Prefer the absolute path in the log, falling back to the
                // name as given.
                shownPath := filename
                if abs, absErr := filepath.Abs(filename); absErr == nil {
                        shownPath = abs
                }
                level.Error(logger).Log("msg", "Error opening query log file", "file", shownPath, "err", err)
                return nil, nil, err
        }

        if err := file.Truncate(int64(filesize)); err != nil {
                file.Close()
                level.Error(logger).Log("msg", "Error setting filesize.", "filesize", filesize, "err", err)
                return nil, nil, err
        }

        mapped, err := mmap.Map(file, mmap.RDWR, 0)
        if err != nil {
                file.Close()
                level.Error(logger).Log("msg", "Failed to mmap", "file", filename, "Attempted size", filesize, "err", err)
                return nil, nil, err
        }

        return mapped, &mmapedFile{f: file, m: mapped}, nil
}

// NewActiveQueryTracker creates a tracker that records active queries in the
// file "queries.active" inside localStoragePath, with room for maxConcurrent
// entries. Queries left over in an existing file are logged first; the file
// is then recreated and memory-mapped.
//
// A failure to create the directory is only logged (including the underlying
// error). A failure to create the memory-mapped file causes a panic.
func NewActiveQueryTracker(localStoragePath string, maxConcurrent int, logger log.Logger) *ActiveQueryTracker {
        err := os.MkdirAll(localStoragePath, 0o777)
        if err != nil {
                level.Error(logger).Log("msg", "Failed to create directory for logging active queries", "err", err)
        }

        // One leading byte for '[' plus one fixed-size slot per query.
        filename, filesize := filepath.Join(localStoragePath, "queries.active"), 1+maxConcurrent*entrySize
        logUnfinishedQueries(filename, filesize, logger)

        fileAsBytes, closer, err := getMMapedFile(filename, filesize, logger)
        if err != nil {
                panic("Unable to create mmap-ed active query log")
        }

        copy(fileAsBytes, "[")
        activeQueryTracker := ActiveQueryTracker{
                mmapedFile:    fileAsBytes,
                closer:        closer,
                getNextIndex:  make(chan int, maxConcurrent),
                logger:        logger,
                maxConcurrent: maxConcurrent,
        }

        // Make every slot available for Insert.
        activeQueryTracker.generateIndices(maxConcurrent)

        return &activeQueryTracker
}

// trimStringByBytes returns the longest prefix of str that is at most size
// bytes long and does not end in the middle of a UTF-8 encoded rune.
//
// If str already fits into size bytes it is returned unchanged. A negative
// size is treated as 0. If the cut position and everything before it consist
// only of UTF-8 continuation bytes (invalid UTF-8), the empty string is
// returned.
func trimStringByBytes(str string, size int) string {
        if size >= len(str) {
                return str
        }
        if size < 0 {
                size = 0
        }
        // Back up to the start of the rune that straddles the cut position.
        // The size > 0 bound keeps malformed input from driving the index
        // below zero.
        for size > 0 && !utf8.RuneStart(str[size]) {
                size--
        }
        return str[:size]
}

// _newJSONEntry returns the JSON encoding of an Entry built from query and
// timestamp. If encoding fails, the failure is logged and an empty byte slice
// is returned.
func _newJSONEntry(query string, timestamp int64, logger log.Logger) []byte {
        encoded, err := json.Marshal(Entry{Query: query, Timestamp: timestamp})
        if err == nil {
                return encoded
        }

        level.Error(logger).Log("msg", "Cannot create json of query", "query", query)
        return []byte{}
}

// newJSONEntry returns the JSON entry for query stamped with the current Unix
// time. The query is first trimmed to entrySize minus the length of an entry
// with an empty query, minus one further byte.
func newJSONEntry(query string, logger log.Logger) []byte {
        now := time.Now().Unix()

        // Bytes taken by the JSON scaffolding of an entry, plus one.
        overhead := len(_newJSONEntry("", now, logger)) + 1
        trimmed := trimStringByBytes(query, entrySize-overhead)

        return _newJSONEntry(trimmed, now, logger)
}

// generateIndices sends the byte offset of each of the maxConcurrent slots to
// getNextIndex. Offsets start at 1 and are entrySize bytes apart.
func (tracker ActiveQueryTracker) generateIndices(maxConcurrent int) {
        offset := 1
        for slot := 0; slot < maxConcurrent; slot++ {
                tracker.getNextIndex <- offset
                offset += entrySize
        }
}

// GetMaxConcurrent returns the maximum number of concurrently tracked queries
// the tracker was created with.
func (tracker ActiveQueryTracker) GetMaxConcurrent() int {
        return tracker.maxConcurrent
}

// Delete overwrites the slot starting at insertIndex with NUL bytes and makes
// the slot available again by sending insertIndex to getNextIndex.
func (tracker ActiveQueryTracker) Delete(insertIndex int) {
        slot := tracker.mmapedFile[insertIndex:]
        blank := strings.Repeat("\x00", entrySize)
        copy(slot, blank)

        tracker.getNextIndex <- insertIndex
}

// Insert waits for a free slot, writes the JSON entry for query into it and
// terminates the slot with a comma. It returns the slot's offset, which the
// caller passes to Delete when the query has finished. If ctx is done before
// a slot becomes free, 0 and the context's error are returned.
func (tracker ActiveQueryTracker) Insert(ctx context.Context, query string) (int, error) {
        select {
        case <-ctx.Done():
                return 0, ctx.Err()
        case slot := <-tracker.getNextIndex:
                entry := newJSONEntry(query, tracker.logger)
                lastByte := slot + entrySize - 1

                copy(tracker.mmapedFile[slot:], entry)
                copy(tracker.mmapedFile[lastByte:], ",")
                return slot, nil
        }
}

// Close releases the tracker's memory-mapped file. It is a no-op for a nil
// tracker or a tracker without a closer.
func (tracker *ActiveQueryTracker) Close() error {
        if tracker == nil || tracker.closer == nil {
                return nil
        }

        err := tracker.closer.Close()
        if err == nil {
                return nil
        }
        return fmt.Errorf("close ActiveQueryTracker.closer: %w", err)
}

// Copyright 2017 The Prometheus Authors
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package promql

import (
        "encoding/json"
        "errors"
        "fmt"
        "math"
        "strconv"
        "strings"

        "github.com/prometheus/prometheus/model/histogram"
        "github.com/prometheus/prometheus/model/labels"
        "github.com/prometheus/prometheus/promql/parser"
        "github.com/prometheus/prometheus/tsdb/chunkenc"
        "github.com/prometheus/prometheus/util/annotations"
)

// Type returns parser.ValueTypeMatrix.
func (Matrix) Type() parser.ValueType { return parser.ValueTypeMatrix }

// Type returns parser.ValueTypeVector.
func (Vector) Type() parser.ValueType { return parser.ValueTypeVector }

// Type returns parser.ValueTypeScalar.
func (Scalar) Type() parser.ValueType { return parser.ValueTypeScalar }

// Type returns parser.ValueTypeString.
func (String) Type() parser.ValueType { return parser.ValueTypeString }

// String represents a string value with an associated timestamp.
type String struct {
        // T is the timestamp. MarshalJSON divides it by 1000 before encoding
        // (presumably milliseconds to seconds; confirm against producers).
        T int64
        // V is the string payload.
        V string
}

// String returns the raw string payload V; the timestamp is not included.
func (s String) String() string {
        return s.V
}

// MarshalJSON implements json.Marshaler. The value is encoded as a
// two-element JSON array: T divided by 1000 as a number, followed by V.
func (s String) MarshalJSON() ([]byte, error) {
        pair := [2]interface{}{float64(s.T) / 1000, s.V}
        return json.Marshal(pair)
}

// Scalar is a data point that's explicitly not associated with a metric.
type Scalar struct {
        // T is the timestamp. MarshalJSON divides it by 1000 before encoding
        // (presumably milliseconds to seconds; confirm against producers).
        T int64
        // V is the float value.
        V float64
}

// String renders the scalar as "scalar: <value> @[<timestamp>]", with the
// value formatted in non-exponent decimal notation using the minimal number
// of digits that round-trips.
func (s Scalar) String() string {
        return fmt.Sprintf("scalar: %v @[%v]", strconv.FormatFloat(s.V, 'f', -1, 64), s.T)
}

// MarshalJSON implements json.Marshaler. The scalar is encoded as a
// two-element JSON array: T divided by 1000 as a number, followed by the
// value as a decimal string.
func (s Scalar) MarshalJSON() ([]byte, error) {
        pair := [2]interface{}{
                float64(s.T) / 1000,
                strconv.FormatFloat(s.V, 'f', -1, 64),
        }
        return json.Marshal(pair)
}

// Series is a stream of data points belonging to a metric. A series may hold
// float points, histogram points, or both.
type Series struct {
        // Metric is the label set identifying the series.
        Metric     labels.Labels `json:"metric"`
        // Floats holds the float points; omitted from JSON when empty.
        Floats     []FPoint      `json:"values,omitempty"`
        // Histograms holds the histogram points; omitted from JSON when empty.
        Histograms []HPoint      `json:"histograms,omitempty"`
}

// String renders the series as its metric followed by " =>" and then one
// point per line: all float points first, then all histogram points, each
// group in slice order.
func (s Series) String() string {
        // TODO(beorn7): Floats are rendered before histograms, each group in
        // its own order. That may be fine for mixed series; if primary
        // ordering by timestamp is preferred instead, this has to change.
        lines := make([]string, len(s.Floats), len(s.Floats)+len(s.Histograms))
        for i := range s.Floats {
                lines[i] = s.Floats[i].String()
        }
        for i := range s.Histograms {
                lines = append(lines, s.Histograms[i].String())
        }
        body := strings.Join(lines, "\n")
        return fmt.Sprintf("%s =>\n%s", s.Metric, body)
}

// FPoint represents a single float data point for a given timestamp.
type FPoint struct {
        // T is the timestamp. MarshalJSON divides it by 1000 before encoding
        // (presumably milliseconds to seconds; confirm against producers).
        T int64
        // F is the float value.
        F float64
}

// String renders the point as "<value> @[<timestamp>]", with the value in
// non-exponent decimal notation.
func (p FPoint) String() string {
        return fmt.Sprintf("%s @[%v]", strconv.FormatFloat(p.F, 'f', -1, 64), p.T)
}

// MarshalJSON implements json.Marshaler. The point is encoded as a
// two-element JSON array: T divided by 1000 as a number, followed by the
// value as a decimal string.
//
// JSON marshaling is only needed for the HTTP API. Because FPoint is
// marshaled so frequently, web/api/v1/api.go has its own optimized encoder
// and this method is unused within Prometheus. It is kept as a convenience
// for debugging and for other users of this code. Note that the two
// implementations may differ slightly in how the timestamp is formatted and
// rounded.
func (p FPoint) MarshalJSON() ([]byte, error) {
        pair := [2]interface{}{
                float64(p.T) / 1000,
                strconv.FormatFloat(p.F, 'f', -1, 64),
        }
        return json.Marshal(pair)
}

// HPoint represents a single histogram data point for a given timestamp.
// H must never be nil; the methods on HPoint dereference it unconditionally.
type HPoint struct {
        // T is the timestamp. MarshalJSON divides it by 1000 before encoding
        // (presumably milliseconds to seconds; confirm against producers).
        T int64
        // H is the float histogram for this point.
        H *histogram.FloatHistogram
}

// String renders the point as "<histogram> @[<timestamp>]", using the
// histogram's own String method.
func (p HPoint) String() string {
        return fmt.Sprintf("%s @[%v]", p.H.String(), p.T)
}

// MarshalJSON implements json.Marshaler. The point is encoded as a
// two-element JSON array: T divided by 1000 as a number, followed by an
// object with "count", "sum" and (if any are non-empty) "buckets". Each
// bucket is an array of a boundary code, lower bound, upper bound and count,
// the last three as decimal strings. Buckets with a zero count are left out.
//
// JSON marshaling is only needed for the HTTP API. Because HPoint is
// marshaled so frequently, web/api/v1/api.go has its own optimized encoder
// and this method is unused within Prometheus. It is kept as a convenience
// for debugging and for other users of this code. Note that the two
// implementations may differ slightly in how the timestamp is formatted and
// rounded.
func (p HPoint) MarshalJSON() ([]byte, error) {
        var payload struct {
                Count   string          `json:"count"`
                Sum     string          `json:"sum"`
                Buckets [][]interface{} `json:"buckets,omitempty"`
        }
        payload.Count = strconv.FormatFloat(p.H.Count, 'f', -1, 64)
        payload.Sum = strconv.FormatFloat(p.H.Sum, 'f', -1, 64)

        for buckets := p.H.AllBucketIterator(); buckets.Next(); {
                b := buckets.At()
                if b.Count == 0 {
                        // Empty buckets carry no information in JSON.
                        continue
                }

                var boundaries int
                switch {
                case b.LowerInclusive && b.UpperInclusive:
                        boundaries = 3 // Closed interval: inclusive on both sides.
                case b.LowerInclusive:
                        boundaries = 1 // Right open: inclusive on the lower end only.
                case b.UpperInclusive:
                        boundaries = 0 // Left open: inclusive on the upper end only.
                default:
                        boundaries = 2 // Open interval: exclusive on both sides.
                }

                payload.Buckets = append(payload.Buckets, []interface{}{
                        boundaries,
                        strconv.FormatFloat(b.Lower, 'f', -1, 64),
                        strconv.FormatFloat(b.Upper, 'f', -1, 64),
                        strconv.FormatFloat(b.Count, 'f', -1, 64),
                })
        }

        return json.Marshal([2]interface{}{float64(p.T) / 1000, payload})
}

// size returns the size of the HPoint expressed as a multiple of the size of
// a float sample. The byte size is taken as 8 bytes for the timestamp p.T
// plus whatever p.H.Size() reports for the histogram. That sum is divided by
// 16 (integer division, so it rounds down), as samples are 16 bytes.
func (p HPoint) size() int {
        return (p.H.Size() + 8) / 16
}

// totalHPointSize sums HPoint.size over all given histogram points, i.e. it
// returns their combined weight in units of float samples.
func totalHPointSize(histograms []HPoint) int {
        sum := 0
        for i := range histograms {
                sum += histograms[i].size()
        }
        return sum
}

// Sample is a single sample belonging to a metric. It represents either a float
// sample or a histogram sample. If H is nil, it is a float sample. Otherwise,
// it is a histogram sample.
type Sample struct {
        // T is the timestamp of the sample.
        T int64
        // F is the float value; only used by String and MarshalJSON when H is nil.
        F float64
        // H is the histogram value; nil marks the sample as a float sample.
        H *histogram.FloatHistogram

        // Metric is the label set the sample belongs to.
        Metric labels.Labels
}

// String renders the sample as "<metric> => <point>", where the point is
// formatted as an HPoint if H is set and as an FPoint otherwise.
func (s Sample) String() string {
        var rendered string
        if s.H != nil {
                rendered = HPoint{T: s.T, H: s.H}.String()
        } else {
                rendered = FPoint{T: s.T, F: s.F}.String()
        }
        return fmt.Sprintf("%s => %s", s.Metric, rendered)
}

// MarshalJSON implements json.Marshaler. A histogram sample (H non-nil) is
// encoded as {"metric": ..., "histogram": ...}; a float sample as
// {"metric": ..., "value": ...}.
//
// This is mirrored in web/api/v1/api.go with jsoniter because FPoint and
// HPoint wouldn't be marshaled with jsoniter otherwise.
func (s Sample) MarshalJSON() ([]byte, error) {
        if s.H != nil {
                var out struct {
                        M labels.Labels `json:"metric"`
                        H HPoint        `json:"histogram"`
                }
                out.M = s.Metric
                out.H = HPoint{T: s.T, H: s.H}
                return json.Marshal(out)
        }

        var out struct {
                M labels.Labels `json:"metric"`
                F FPoint        `json:"value"`
        }
        out.M = s.Metric
        out.F = FPoint{T: s.T, F: s.F}
        return json.Marshal(out)
}

// Vector is a slice of Sample. The contract (not enforced by the type) is
// that all Samples in a Vector have the same timestamp.
type Vector []Sample

// String renders every sample on its own line, in slice order.
func (vec Vector) String() string {
        lines := make([]string, 0, len(vec))
        for i := range vec {
                lines = append(lines, vec[i].String())
        }
        return strings.Join(lines, "\n")
}

// TotalSamples returns the weighted number of samples in the vector. Every
// sample counts as 1; a histogram sample additionally counts its histogram
// size divided by 16 (the size of a float sample, see HPoint.size), so that
// histograms weigh more than floats.
func (vec Vector) TotalSamples() int {
        // One unit per sample, histogram or not.
        total := len(vec)
        for i := range vec {
                if h := vec[i].H; h != nil {
                        total += h.Size() / 16
                }
        }
        return total
}

// ContainsSameLabelset reports whether two samples in the vector have label
// sets with the same hash. Such a vector is semantically undefined, see
// https://github.com/prometheus/prometheus/issues/4562
func (vec Vector) ContainsSameLabelset() bool {
        if len(vec) < 2 {
                return false
        }
        if len(vec) == 2 {
                // Not worth a map for a single comparison.
                return vec[0].Metric.Hash() == vec[1].Metric.Hash()
        }

        seen := make(map[uint64]struct{}, len(vec))
        for i := range vec {
                key := vec[i].Metric.Hash()
                if _, dup := seen[key]; dup {
                        return true
                }
                seen[key] = struct{}{}
        }
        return false
}

// Matrix is a slice of Series that implements sort.Interface (ordering by
// the series' Metric labels) and has a String method.
type Matrix []Series

// String renders every series via Series.String, separated by newlines, in
// slice order.
func (m Matrix) String() string {
        // TODO(fabxc): sort, or can we rely on order from the querier?
        rendered := make([]string, 0, len(m))
        for i := range m {
                rendered = append(rendered, m[i].String())
        }
        return strings.Join(rendered, "\n")
}

// TotalSamples returns the weighted number of samples across all series in
// the matrix. Each float point counts as 1; each histogram point counts as
// its size relative to a float sample, see HPoint.size for details.
func (m Matrix) TotalSamples() int {
        total := 0
        for i := range m {
                total += len(m[i].Floats)
                total += totalHPointSize(m[i].Histograms)
        }
        return total
}

// Len, Less and Swap implement sort.Interface. Series are ordered by
// comparing their Metric label sets with labels.Compare.
func (m Matrix) Len() int           { return len(m) }
func (m Matrix) Less(i, j int) bool { return labels.Compare(m[i].Metric, m[j].Metric) < 0 }
func (m Matrix) Swap(i, j int)      { m[i], m[j] = m[j], m[i] }

// ContainsSameLabelset reports whether two series in the matrix have label
// sets with the same hash. Such a matrix is semantically undefined, see
// https://github.com/prometheus/prometheus/issues/4562
func (m Matrix) ContainsSameLabelset() bool {
        if len(m) < 2 {
                return false
        }
        if len(m) == 2 {
                // Not worth a map for a single comparison.
                return m[0].Metric.Hash() == m[1].Metric.Hash()
        }

        seen := make(map[uint64]struct{}, len(m))
        for i := range m {
                key := m[i].Metric.Hash()
                if _, dup := seen[key]; dup {
                        return true
                }
                seen[key] = struct{}{}
        }
        return false
}

// Result holds the resulting value of an execution or an error
// if any occurred.
type Result struct {
        // Err is the execution error, if any. When set, the accessor methods
        // return it instead of looking at Value.
        Err      error
        // Value is the resulting value; may be nil (String then returns "").
        Value    parser.Value
        // Warnings holds annotations attached to the result.
        Warnings annotations.Annotations
}

// Vector returns the result value as a Vector. It returns r.Err if the
// result carries an error, and a descriptive error if the value is not a
// Vector.
func (r *Result) Vector() (Vector, error) {
        if r.Err != nil {
                return nil, r.Err
        }
        if vec, isVector := r.Value.(Vector); isVector {
                return vec, nil
        }
        return nil, errors.New("query result is not a Vector")
}

// Matrix returns the result value as a Matrix. It returns r.Err if the
// result carries an error, and a descriptive error if the value is not a
// Matrix.
func (r *Result) Matrix() (Matrix, error) {
        if r.Err != nil {
                return nil, r.Err
        }
        if mat, isMatrix := r.Value.(Matrix); isMatrix {
                return mat, nil
        }
        return nil, errors.New("query result is not a range Vector")
}

// Scalar returns the result value as a Scalar. It returns r.Err if the
// result carries an error, and a descriptive error if the value is not a
// Scalar; in both cases the returned Scalar is the zero value.
func (r *Result) Scalar() (Scalar, error) {
        if r.Err != nil {
                return Scalar{}, r.Err
        }
        if sc, isScalar := r.Value.(Scalar); isScalar {
                return sc, nil
        }
        return Scalar{}, errors.New("query result is not a Scalar")
}

// String returns the error text if the result carries an error, the empty
// string if there is no value, and the value's string form otherwise.
func (r *Result) String() string {
        switch {
        case r.Err != nil:
                return r.Err.Error()
        case r.Value == nil:
                return ""
        default:
                return r.Value.String()
        }
}

// StorageSeries simulates promql.Series as storage.Series.
type StorageSeries struct {
        // series is the wrapped promql Series whose labels and points are exposed.
        series Series
}

// NewStorageSeries wraps the given Series in a newly allocated StorageSeries.
func NewStorageSeries(series Series) *StorageSeries {
        wrapped := new(StorageSeries)
        wrapped.series = series
        return wrapped
}

// Labels returns the Metric label set of the wrapped series.
func (ss *StorageSeries) Labels() labels.Labels {
        return ss.series.Metric
}

// Iterator returns an iterator over the data of the series. If it is a
// *storageSeriesIterator, it is reset to this series and reused; otherwise a
// new iterator is allocated. In case of multiple samples with the same
// timestamp, the iterator yields the float samples first.
func (ss *StorageSeries) Iterator(it chunkenc.Iterator) chunkenc.Iterator {
        reusable, ok := it.(*storageSeriesIterator)
        if !ok {
                return newStorageSeriesIterator(ss.series)
        }
        reusable.reset(ss.series)
        return reusable
}

// storageSeriesIterator iterates over the float and histogram points of a
// Series, merging the two slices by timestamp (floats first on ties).
type storageSeriesIterator struct {
        // floats and histograms are the points being iterated.
        floats               []FPoint
        histograms           []HPoint
        // iFloats and iHistograms are the cursors into floats and histograms.
        // iFloats starts at -1 and iHistograms at 0; Next advances whichever
        // one produced the current sample.
        iFloats, iHistograms int
        // currT, currF and currH describe the current sample. currH is nil
        // when the current sample is a float (or before the first Next).
        currT                int64
        currF                float64
        currH                *histogram.FloatHistogram
}

// newStorageSeriesIterator allocates an iterator positioned before the first
// point of series.
func newStorageSeriesIterator(series Series) *storageSeriesIterator {
        it := new(storageSeriesIterator)
        // reset puts the iterator into exactly the initial state: cursors at
        // -1 (floats) and 0 (histograms), currT at math.MinInt64.
        it.reset(series)
        return it
}

// reset re-targets the iterator at series and rewinds it to its initial
// state, positioned before the first point.
func (ssi *storageSeriesIterator) reset(series Series) {
        // Fields not named here (iHistograms, currF, currH) take their zero
        // values, which is the initial state for each of them.
        *ssi = storageSeriesIterator{
                floats:     series.Floats,
                histograms: series.Histograms,
                iFloats:    -1,
                currT:      math.MinInt64,
        }
}

// Seek advances the iterator until the current sample has a timestamp of at
// least t and returns that sample's value type (ValFloat or
// ValFloatHistogram). It returns chunkenc.ValNone if the series is exhausted
// before such a sample is found. The iterator only ever moves forwards; if
// the current sample already satisfies t, it stays where it is.
func (ssi *storageSeriesIterator) Seek(t int64) chunkenc.ValueType {
        if ssi.iFloats >= len(ssi.floats) && ssi.iHistograms >= len(ssi.histograms) {
                return chunkenc.ValNone
        }
        // iFloats is -1 only while the iterator has never been advanced: the
        // first Next always increments it because currH starts out nil. In
        // that state there is no current sample, so we must advance at least
        // once even if the placeholder currT (math.MinInt64) is not below t.
        // Otherwise Seek(math.MinInt64) on a fresh iterator would report a
        // float sample that does not exist.
        for ssi.currT < t || ssi.iFloats == -1 {
                if ssi.Next() == chunkenc.ValNone {
                        return chunkenc.ValNone
                }
        }
        if ssi.currH != nil {
                return chunkenc.ValFloatHistogram
        }
        return chunkenc.ValFloat
}

// At returns the timestamp and float value of the current sample.
func (ssi *storageSeriesIterator) At() (t int64, v float64) {
        return ssi.currT, ssi.currF
}

// AtHistogram is not supported by this iterator and always panics.
func (ssi *storageSeriesIterator) AtHistogram(*histogram.Histogram) (int64, *histogram.Histogram) {
        panic(errors.New("storageSeriesIterator: AtHistogram not supported"))
}

// AtFloatHistogram returns the timestamp and float histogram of the current
// sample. The argument is ignored; the stored histogram pointer is returned
// as is, without copying.
func (ssi *storageSeriesIterator) AtFloatHistogram(*histogram.FloatHistogram) (int64, *histogram.FloatHistogram) {
        return ssi.currT, ssi.currH
}

// AtT returns the timestamp of the current sample.
func (ssi *storageSeriesIterator) AtT() int64 {
        return ssi.currT
}

// Next advances the iterator to the next sample and returns its value type,
// or chunkenc.ValNone once both floats and histograms are exhausted. Floats
// and histograms are merged by timestamp; on equal timestamps the float is
// returned first.
func (ssi *storageSeriesIterator) Next() chunkenc.ValueType {
        // Step past the sample that is current right now. currH tells which
        // slice it came from; before the first call currH is nil, so iFloats
        // moves from its initial -1 to 0.
        if ssi.currH != nil {
                ssi.iHistograms++
        } else {
                ssi.iFloats++
        }
        var (
                pickH, pickF        = false, false
                floatsExhausted     = ssi.iFloats >= len(ssi.floats)
                histogramsExhausted = ssi.iHistograms >= len(ssi.histograms)
        )

        // Decide which slice supplies the next sample.
        switch {
        case floatsExhausted:
                if histogramsExhausted { // Both exhausted!
                        return chunkenc.ValNone
                }
                pickH = true
        case histogramsExhausted: // and floats not exhausted.
                pickF = true
        // From here on, we have to look at timestamps.
        case ssi.histograms[ssi.iHistograms].T < ssi.floats[ssi.iFloats].T:
                // Next histogram comes before next float.
                pickH = true
        default:
                // In all other cases, we pick float so that we first iterate
                // through floats if the timestamp is the same.
                pickF = true
        }

        // Load the chosen sample into the current-sample fields.
        switch {
        case pickF:
                p := ssi.floats[ssi.iFloats]
                ssi.currT = p.T
                ssi.currF = p.F
                ssi.currH = nil
                return chunkenc.ValFloat
        case pickH:
                p := ssi.histograms[ssi.iHistograms]
                ssi.currT = p.T
                ssi.currF = 0
                ssi.currH = p.H
                return chunkenc.ValFloatHistogram
        default:
                // Not reachable: every path through the switch above either
                // returns or sets exactly one of pickF/pickH.
                panic("storageSeriesIterater.Next failed to pick value type")
        }
}

// Err always returns nil; this iterator does not produce errors.
func (ssi *storageSeriesIterator) Err() error {
        return nil
}

// Copyright 2017 The Prometheus Authors
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package storage

import (
        "fmt"
        "math"

        "github.com/prometheus/prometheus/model/histogram"
        "github.com/prometheus/prometheus/tsdb/chunkenc"
        "github.com/prometheus/prometheus/tsdb/chunks"
)

// BufferedSeriesIterator wraps an iterator with a look-back buffer.
type BufferedSeriesIterator struct {
        // hReader and fhReader are passed to the wrapped iterator's
        // AtHistogram/AtFloatHistogram when Next copies the current sample
        // into the buffer.
        hReader  histogram.Histogram
        fhReader histogram.FloatHistogram

        // it is the wrapped iterator, buf the look-back buffer, and delta the
        // original look-back range that Reset restores on buf.
        it    chunkenc.Iterator
        buf   *sampleRing
        delta int64

        // lastTime is the timestamp of the most recent sample the wrapped
        // iterator was positioned on (math.MinInt64 after Reset), and
        // valueType is the value type of the wrapped iterator's current sample.
        lastTime  int64
        valueType chunkenc.ValueType
}

// NewBuffer returns a new iterator that buffers the values within the time range
// of the current element and the duration of delta before, initialized with an
// empty iterator (chunkenc.NewNopIterator). Use Reset() to set an actual
// iterator to be buffered.
func NewBuffer(delta int64) *BufferedSeriesIterator {
        return NewBufferIterator(chunkenc.NewNopIterator(), delta)
}

// NewBufferIterator wraps it in a BufferedSeriesIterator that buffers the
// values within the time range of the current element and the duration of
// delta before. The look-back buffer starts out unallocated and untyped. The
// wrapped iterator is advanced once as part of construction (see Reset).
func NewBufferIterator(it chunkenc.Iterator, delta int64) *BufferedSeriesIterator {
        ring := newSampleRing(delta, 0, chunkenc.ValNone)
        buffered := &BufferedSeriesIterator{buf: ring, delta: delta}
        buffered.Reset(it)
        return buffered
}

// Reset re-uses the buffer with a new iterator. The buffer is emptied, its
// time delta is restored to the original value (undoing any ReduceDelta),
// and the new iterator is advanced once to its first element.
func (b *BufferedSeriesIterator) Reset(it chunkenc.Iterator) {
        b.it, b.lastTime = it, math.MinInt64

        b.buf.reset()
        b.buf.delta = b.delta

        b.valueType = b.it.Next()
}

// ReduceDelta lowers the buffered time delta, for the current SeriesIterator only
// (Reset restores the original delta). It returns whatever the underlying
// buffer's reduceDelta reports.
func (b *BufferedSeriesIterator) ReduceDelta(delta int64) bool {
        return b.buf.reduceDelta(delta)
}

// PeekBack returns the nth previous element of the iterator, taken from the
// look-back buffer. If there is none buffered, ok is false.
func (b *BufferedSeriesIterator) PeekBack(n int) (sample chunks.Sample, ok bool) {
        return b.buf.nthLast(n)
}

// Buffer returns an iterator over the buffered data. Invalidates previously
// returned iterators, because the same iterator value is reset and handed
// out on every call.
func (b *BufferedSeriesIterator) Buffer() *SampleRingIterator {
        return b.buf.iterator()
}

// Seek advances the iterator to the element at time t or greater and returns
// its value type, or chunkenc.ValNone if the underlying iterator is
// exhausted first. Elements passed on the way are added to the look-back
// buffer by Next.
func (b *BufferedSeriesIterator) Seek(t int64) chunkenc.ValueType {
        // t0 is the start of the look-back window for the target time.
        t0 := t - b.buf.delta

        // If the delta would cause us to seek backwards, preserve the buffer
        // and just continue regular advancement while filling the buffer on the way.
        // Otherwise (window start is ahead of the current position) the
        // buffered data is dropped and the underlying iterator jumps to t0.
        if b.valueType != chunkenc.ValNone && t0 > b.lastTime {
                b.buf.reset()

                b.valueType = b.it.Seek(t0)
                switch b.valueType {
                case chunkenc.ValNone:
                        return chunkenc.ValNone
                case chunkenc.ValFloat, chunkenc.ValHistogram, chunkenc.ValFloatHistogram:
                        b.lastTime = b.AtT()
                default:
                        panic(fmt.Errorf("BufferedSeriesIterator: unknown value type %v", b.valueType))
                }
        }

        // Already at or past the target: nothing more to do.
        if b.lastTime >= t {
                return b.valueType
        }
        // Step forward one element at a time so that each passed element
        // lands in the buffer.
        for {
                if b.valueType = b.Next(); b.valueType == chunkenc.ValNone || b.lastTime >= t {
                        return b.valueType
                }
        }
}

// Next advances the iterator to the next element and returns its value type.
// The element that was current before the call is first added to the
// look-back buffer. Once the iterator has reported chunkenc.ValNone, Next
// keeps returning ValNone without touching the underlying iterator.
func (b *BufferedSeriesIterator) Next() chunkenc.ValueType {
        // Add current element to buffer before advancing.
        switch b.valueType {
        case chunkenc.ValNone:
                return chunkenc.ValNone
        case chunkenc.ValFloat:
                t, f := b.it.At()
                b.buf.addF(fSample{t: t, f: f})
        case chunkenc.ValHistogram:
                t, h := b.it.AtHistogram(&b.hReader)
                b.buf.addH(hSample{t: t, h: h})
        case chunkenc.ValFloatHistogram:
                t, fh := b.it.AtFloatHistogram(&b.fhReader)
                b.buf.addFH(fhSample{t: t, fh: fh})
        default:
                panic(fmt.Errorf("BufferedSeriesIterator: unknown value type %v", b.valueType))
        }

        b.valueType = b.it.Next()
        // lastTime is only updated while there is a current element, so it
        // keeps the timestamp of the final element after exhaustion.
        if b.valueType != chunkenc.ValNone {
                b.lastTime = b.AtT()
        }
        return b.valueType
}

// At returns the current float element of the wrapped iterator.
func (b *BufferedSeriesIterator) At() (int64, float64) {
        return b.it.At()
}

// AtHistogram returns the current histogram element of the wrapped iterator.
// The argument is passed through unchanged.
func (b *BufferedSeriesIterator) AtHistogram(fh *histogram.Histogram) (int64, *histogram.Histogram) {
        return b.it.AtHistogram(fh)
}

// AtFloatHistogram returns the current float-histogram element of the wrapped
// iterator. The argument is passed through unchanged.
func (b *BufferedSeriesIterator) AtFloatHistogram(fh *histogram.FloatHistogram) (int64, *histogram.FloatHistogram) {
        return b.it.AtFloatHistogram(fh)
}

// AtT returns the current timestamp of the wrapped iterator.
func (b *BufferedSeriesIterator) AtT() int64 {
        return b.it.AtT()
}

// Err returns the last encountered error of the wrapped iterator.
func (b *BufferedSeriesIterator) Err() error {
        return b.it.Err()
}

// fSample is a float sample: a timestamp and a float value.
type fSample struct {
        t int64
        f float64
}

// T returns the sample's timestamp.
func (s fSample) T() int64 {
        return s.t
}

// F returns the sample's float value.
func (s fSample) F() float64 {
        return s.f
}

// H always panics; an fSample holds no histogram.
func (s fSample) H() *histogram.Histogram {
        panic("H() called for fSample")
}

// FH always panics; an fSample holds no float histogram.
func (s fSample) FH() *histogram.FloatHistogram {
        panic("FH() called for fSample")
}

// Type returns chunkenc.ValFloat.
func (s fSample) Type() chunkenc.ValueType {
        return chunkenc.ValFloat
}

// hSample is an integer-histogram sample: a timestamp and a histogram.
type hSample struct {
        t int64
        h *histogram.Histogram
}

// T returns the sample's timestamp.
func (s hSample) T() int64 {
        return s.t
}

// F always panics; an hSample holds no float value.
func (s hSample) F() float64 {
        panic("F() called for hSample")
}

// H returns the sample's histogram.
func (s hSample) H() *histogram.Histogram {
        return s.h
}

// FH returns the histogram converted via ToFloat(nil), i.e. without
// supplying a FloatHistogram to reuse.
func (s hSample) FH() *histogram.FloatHistogram {
        return s.h.ToFloat(nil)
}

// Type returns chunkenc.ValHistogram.
func (s hSample) Type() chunkenc.ValueType {
        return chunkenc.ValHistogram
}

// fhSample is a float-histogram sample: a timestamp and a float histogram.
type fhSample struct {
        t  int64
        fh *histogram.FloatHistogram
}

// T returns the sample's timestamp.
func (s fhSample) T() int64 {
        return s.t
}

// F always panics; an fhSample holds no float value.
func (s fhSample) F() float64 {
        panic("F() called for fhSample")
}

// H always panics; an fhSample holds no integer histogram.
func (s fhSample) H() *histogram.Histogram {
        panic("H() called for fhSample")
}

// FH returns the sample's float histogram.
func (s fhSample) FH() *histogram.FloatHistogram {
        return s.fh
}

// Type returns chunkenc.ValFloatHistogram.
func (s fhSample) Type() chunkenc.ValueType {
        return chunkenc.ValFloatHistogram
}

// sampleRing is the ring buffer backing BufferedSeriesIterator's look-back
// window.
type sampleRing struct {
        // delta is the look-back range; BufferedSeriesIterator subtracts it
        // from the seek target to find the start of the window.
        delta int64

        // Lookback buffers. We use iBuf for mixed samples, but one of the three
        // concrete ones for homogenous samples. (Only one of the four bufs is
        // allowed to be populated!) This avoids the overhead of the interface
        // wrapper for the happy (and by far most common) case of homogenous
        // samples.
        iBuf     []chunks.Sample
        fBuf     []fSample
        hBuf     []hSample
        fhBuf    []fhSample
        bufInUse bufType

        i int // Position of most recent element in ring buffer.
        f int // Position of first element in ring buffer.
        l int // Number of elements in buffer.

        // it is the single iterator instance handed out (after a reset) by
        // iterator().
        it SampleRingIterator
}

// bufType identifies which of sampleRing's buffers currently holds the
// samples.
type bufType int

const (
        noBuf bufType = iota // Nothing yet stored in sampleRing.
        iBuf                 // Samples are in the interface buffer (iBuf field).
        fBuf                 // Samples are in the float buffer (fBuf field).
        hBuf                 // Samples are in the histogram buffer (hBuf field).
        fhBuf                // Samples are in the float-histogram buffer (fhBuf field).
)

// newSampleRing creates a new, empty sampleRing with the given delta. If
// size is positive and typ is one of ValFloat, ValHistogram or
// ValFloatHistogram, the matching specialized buffer is pre-allocated with
// that many slots. If you do not know the preferred value type yet, use a
// size of 0 (in which case the provided typ doesn't matter). On the first
// add, a buffer of size 16 will be allocated with the preferred type being
// the type of the first added sample.
func newSampleRing(delta int64, size int, typ chunkenc.ValueType) *sampleRing {
        ring := &sampleRing{delta: delta}
        ring.reset()

        // With a non-positive size, allocation is deferred to the first add.
        if size > 0 {
                // For any other typ nothing is allocated, because the first
                // sample will go to one of these three buffers anyway.
                switch typ {
                case chunkenc.ValFloat:
                        ring.fBuf = make([]fSample, size)
                case chunkenc.ValHistogram:
                        ring.hBuf = make([]hSample, size)
                case chunkenc.ValFloatHistogram:
                        ring.fhBuf = make([]fhSample, size)
                }
        }
        return ring
}

// reset empties the ring buffer without releasing the specialized buffers'
// storage: the element count and positions are rewound and the ring is
// marked as holding no samples of any type.
func (r *sampleRing) reset() {
        r.bufInUse = noBuf
        r.l, r.i, r.f = 0, -1, 0

        // After a reset the first sample always lands in a specialized
        // buffer. Should a later sample force a switch to the interface
        // buffer, the specialized contents get copied over, which only works
        // if the interface buffer is empty at that point. Hence truncate it
        // here as well.
        r.iBuf = r.iBuf[:0]
}

// iterator resets and returns the ring's single embedded iterator.
// Invalidates previously returned iterators, since they are the same value.
func (r *sampleRing) iterator() *SampleRingIterator {
        r.it.reset(r)
        return &r.it
}

// SampleRingIterator is returned by BufferedSeriesIterator.Buffer() and can be
// used to iterate samples buffered in the lookback window.
type SampleRingIterator struct {
        // r is the ring being iterated and i the index of the current
        // element, counted from the ring's first element (-1 before Next).
        r  *sampleRing
        i  int
        // t, f, h and fh hold the current sample as loaded by Next; which of
        // f, h and fh is meaningful depends on the value type Next returned.
        t  int64
        f  float64
        h  *histogram.Histogram
        fh *histogram.FloatHistogram
}

// reset points the iterator at r, rewinds it to before the first element and
// clears both histogram pointers. The cached timestamp and float value are
// left as they are.
func (it *SampleRingIterator) reset(r *sampleRing) {
        it.r, it.i = r, -1
        it.h, it.fh = nil, nil
}

// Next advances to the next buffered sample, loads it into the iterator and
// returns its value type. It returns chunkenc.ValNone once all buffered
// samples have been visited.
func (it *SampleRingIterator) Next() chunkenc.ValueType {
        it.i++
        if it.i >= it.r.l {
                return chunkenc.ValNone
        }
        // Fast paths: the ring holds samples of a single concrete type.
        switch it.r.bufInUse {
        case fBuf:
                s := it.r.atF(it.i)
                it.t = s.t
                it.f = s.f
                return chunkenc.ValFloat
        case hBuf:
                s := it.r.atH(it.i)
                it.t = s.t
                it.h = s.h
                return chunkenc.ValHistogram
        case fhBuf:
                s := it.r.atFH(it.i)
                it.t = s.t
                it.fh = s.fh
                return chunkenc.ValFloatHistogram
        }
        // Otherwise read from the interface buffer and dispatch on the
        // sample's own type. For histogram samples the other histogram
        // pointer is cleared so that AtFloatHistogram picks the right source.
        s := it.r.at(it.i)
        it.t = s.T()
        switch s.Type() {
        case chunkenc.ValHistogram:
                it.h = s.H()
                it.fh = nil
                return chunkenc.ValHistogram
        case chunkenc.ValFloatHistogram:
                it.fh = s.FH()
                it.h = nil
                return chunkenc.ValFloatHistogram
        default:
                it.f = s.F()
                return chunkenc.ValFloat
        }
}

// At returns the timestamp and float value of the current element, as loaded
// by the last call to Next.
func (it *SampleRingIterator) At() (int64, float64) {
        return it.t, it.f
}

// AtHistogram returns the timestamp and histogram of the current element, as
// loaded by the last call to Next. The stored pointer is returned without
// copying.
func (it *SampleRingIterator) AtHistogram() (int64, *histogram.Histogram) {
        return it.t, it.h
}

// AtFloatHistogram returns the current histogram element of the iterator as
// a float histogram. If the current sample is an integer histogram, it is
// converted with ToFloat(fh). Otherwise the stored float histogram is copied,
// into fh if one is provided, or into a newly allocated object if fh is nil.
// Passing a non-nil fh therefore avoids an allocation.
func (it *SampleRingIterator) AtFloatHistogram(fh *histogram.FloatHistogram) (int64, *histogram.FloatHistogram) {
        switch {
        case it.fh == nil:
                // Integer histogram: convert, reusing fh if given.
                return it.t, it.h.ToFloat(fh)
        case fh == nil:
                return it.t, it.fh.Copy()
        default:
                it.fh.CopyTo(fh)
                return it.t, fh
        }
}

// AtT returns the timestamp of the current element, as loaded by the last
// call to Next.
func (it *SampleRingIterator) AtT() int64 {
        return it.t
}

// at returns the i-th sample of the interface buffer, counted from the
// ring's first element and wrapping around the end of the slice.
func (r *sampleRing) at(i int) chunks.Sample {
        return r.iBuf[(r.f+i)%len(r.iBuf)]
}

// atF is like at, but reads from the float buffer.
func (r *sampleRing) atF(i int) fSample {
        return r.fBuf[(r.f+i)%len(r.fBuf)]
}

// atH is like at, but reads from the histogram buffer.
func (r *sampleRing) atH(i int) hSample {
        return r.hBuf[(r.f+i)%len(r.hBuf)]
}

// atFH is like at, but reads from the float-histogram buffer.
func (r *sampleRing) atFH(i int) fhSample {
        return r.fhBuf[(r.f+i)%len(r.fhBuf)]
}

// add adds a sample to the ring buffer and frees all samples that fall out of
// the delta range. Note that this method works for any sample
// implementation. If you know you are dealing with one of the implementations
// from this package (fSample, hSample, fhSample), call one of the specialized
// methods addF, addH, or addFH for better performance.
//
// As long as all samples added so far are of the same specialized type, they
// are kept in the matching specialized buffer. Once a sample of a different
// type arrives, the existing samples are moved into the interface buffer and
// all further samples go there.
func (r *sampleRing) add(s chunks.Sample) {
        if r.bufInUse == noBuf {
                // First sample.
                switch s := s.(type) {
                case fSample:
                        r.bufInUse = fBuf
                        r.fBuf = addF(s, r.fBuf, r)
                case hSample:
                        r.bufInUse = hBuf
                        r.hBuf = addH(s, r.hBuf, r)
                case fhSample:
                        r.bufInUse = fhBuf
                        r.fhBuf = addFH(s, r.fhBuf, r)
                default:
                        // Not one of the specialized implementations. Store it
                        // in the interface buffer rather than silently dropping
                        // it, so that the method really works for any sample
                        // implementation.
                        r.bufInUse = iBuf
                        r.iBuf = addSample(s, r.iBuf, r)
                }
                return
        }
        if r.bufInUse != iBuf {
                // Nothing added to the interface buf yet. Let's check if we can
                // stay specialized.
                switch s := s.(type) {
                case fSample:
                        if r.bufInUse == fBuf {
                                r.fBuf = addF(s, r.fBuf, r)
                                return
                        }
                case hSample:
                        if r.bufInUse == hBuf {
                                r.hBuf = addH(s, r.hBuf, r)
                                return
                        }
                case fhSample:
                        if r.bufInUse == fhBuf {
                                r.fhBuf = addFH(s, r.fhBuf, r)
                                return
                        }
                }
                // The new sample isn't a fit for the already existing
                // ones. Copy the latter into the interface buffer where needed.
                // The interface buffer is assumed to be of length zero at this point.
                // The whole specialized slice is copied position by position,
                // so the ring indices in r remain valid for the interface buffer.
                switch r.bufInUse {
                case fBuf:
                        for _, s := range r.fBuf {
                                r.iBuf = append(r.iBuf, s)
                        }
                        r.fBuf = nil
                case hBuf:
                        for _, s := range r.hBuf {
                                r.iBuf = append(r.iBuf, s)
                        }
                        r.hBuf = nil
                case fhBuf:
                        for _, s := range r.fhBuf {
                                r.iBuf = append(r.iBuf, s)
                        }
                        r.fhBuf = nil
                }
                r.bufInUse = iBuf
        }
        r.iBuf = addSample(s, r.iBuf, r)
}

// addF is the fSample-specialized variant of the add method.
func (r *sampleRing) addF(s fSample) {
        if r.bufInUse == iBuf {
                // Interface samples are already present, so this one joins them.
                r.iBuf = addSample(s, r.iBuf, r)
                return
        }
        if r.bufInUse == fBuf || r.bufInUse == noBuf {
                // Either the very first sample or one more fSample.
                r.fBuf = addF(s, r.fBuf, r)
                r.bufInUse = fBuf
                return
        }
        // Specialized samples of another type are present. The checked add
        // method takes care of the conversion.
        r.add(s)
}

// addH is the hSample-specialized variant of the add method.
func (r *sampleRing) addH(s hSample) {
        if r.bufInUse == iBuf {
                // Interface samples are already present, so this one joins them.
                r.iBuf = addSample(s, r.iBuf, r)
                return
        }
        if r.bufInUse == hBuf || r.bufInUse == noBuf {
                // Either the very first sample or one more hSample.
                r.hBuf = addH(s, r.hBuf, r)
                r.bufInUse = hBuf
                return
        }
        // Specialized samples of another type are present. The checked add
        // method takes care of the conversion.
        r.add(s)
}

// addFH is the fhSample-specialized variant of the add method.
func (r *sampleRing) addFH(s fhSample) {
        if r.bufInUse == iBuf {
                // Interface samples are already present, so this one joins them.
                r.iBuf = addSample(s, r.iBuf, r)
                return
        }
        if r.bufInUse == fhBuf || r.bufInUse == noBuf {
                // Either the very first sample or one more fhSample.
                r.fhBuf = addFH(s, r.fhBuf, r)
                r.bufInUse = fhBuf
                return
        }
        // Specialized samples of another type are present. The checked add
        // method takes care of the conversion.
        r.add(s)
}

// genericAdd is a generic implementation of adding a chunks.Sample
// implementation to a buffer of a sample ring. However, the Go compiler
// currently (go1.20) decides to not expand the code during compile time, but
// creates dynamic code to handle the different types. That has a significant
// overhead during runtime, noticeable in PromQL benchmarks. For example, the
// "RangeQuery/expr=rate(a_hundred[1d]),steps=.*" benchmarks show about 7%
// longer runtime, 9% higher allocation size, and 10% more allocations.
// Therefore, genericAdd has been manually implemented for all the types
// (addSample, addF, addH, addFH) below.
//
// func genericAdd[T chunks.Sample](s T, buf []T, r *sampleRing) []T {
//         l := len(buf)
//         // Grow the ring buffer if it fits no more elements.
//         if l == 0 {
//                 buf = make([]T, 16)
//                 l = 16
//         }
//         if l == r.l {
//                 newBuf := make([]T, 2*l)
//                 copy(newBuf[l+r.f:], buf[r.f:])
//                 copy(newBuf, buf[:r.f])
//
//                 buf = newBuf
//                 r.i = r.f
//                 r.f += l
//                 l = 2 * l
//         } else {
//                 r.i++
//                 if r.i >= l {
//                         r.i -= l
//                 }
//         }
//
//         buf[r.i] = s
//         r.l++
//
//         // Free head of the buffer of samples that just fell out of the range.
//         tmin := s.T() - r.delta
//         for buf[r.f].T() < tmin {
//                 r.f++
//                 if r.f >= l {
//                         r.f -= l
//                 }
//                 r.l--
//         }
//         return buf
// }

// addSample is a handcoded specialization of genericAdd (see above) for
// buffers of the chunks.Sample interface type. It stores s in the ring whose
// backing slice is buf and whose indices are kept in r, then drops all samples
// from the head of the ring that are older than s.T() - r.delta. It returns
// the backing slice, which is a newly allocated one if the ring had to grow.
func addSample(s chunks.Sample, buf []chunks.Sample, r *sampleRing) []chunks.Sample {
        l := len(buf)
        // Grow the ring buffer if it fits no more elements.
        if l == 0 {
                // No backing slice yet: start out with 16 slots.
                buf = make([]chunks.Sample, 16)
                l = 16
        }
        if l == r.l {
                // The ring is full: double its size. Everything from the first
                // element to the end of the old slice moves to the end of the
                // new slice, everything before the first element stays at the
                // start. The gap in between begins at the old r.f, which is
                // where the new sample is written.
                newBuf := make([]chunks.Sample, 2*l)
                copy(newBuf[l+r.f:], buf[r.f:])
                copy(newBuf, buf[:r.f])

                buf = newBuf
                r.i = r.f
                r.f += l
                l = 2 * l
        } else {
                // There is room left: advance the write index, wrapping around
                // at the end of the slice.
                r.i++
                if r.i >= l {
                        r.i -= l
                }
        }

        buf[r.i] = s
        r.l++

        // Free head of the buffer of samples that just fell out of the range.
        tmin := s.T() - r.delta
        for buf[r.f].T() < tmin {
                r.f++
                if r.f >= l {
                        r.f -= l
                }
                r.l--
        }
        return buf
}

// addF is a handcoded specialization of genericAdd (see above) for fSample
// buffers. It stores s in the ring whose backing slice is buf and whose
// indices are kept in r, then drops all samples from the head of the ring that
// are older than s.T() - r.delta. It returns the backing slice, which is a
// newly allocated one if the ring had to grow.
func addF(s fSample, buf []fSample, r *sampleRing) []fSample {
        l := len(buf)
        // Grow the ring buffer if it fits no more elements.
        if l == 0 {
                // No backing slice yet: start out with 16 slots.
                buf = make([]fSample, 16)
                l = 16
        }
        if l == r.l {
                // The ring is full: double its size. Everything from the first
                // element to the end of the old slice moves to the end of the
                // new slice, everything before the first element stays at the
                // start. The gap in between begins at the old r.f, which is
                // where the new sample is written.
                newBuf := make([]fSample, 2*l)
                copy(newBuf[l+r.f:], buf[r.f:])
                copy(newBuf, buf[:r.f])

                buf = newBuf
                r.i = r.f
                r.f += l
                l = 2 * l
        } else {
                // There is room left: advance the write index, wrapping around
                // at the end of the slice.
                r.i++
                if r.i >= l {
                        r.i -= l
                }
        }

        buf[r.i] = s
        r.l++

        // Free head of the buffer of samples that just fell out of the range.
        tmin := s.T() - r.delta
        for buf[r.f].T() < tmin {
                r.f++
                if r.f >= l {
                        r.f -= l
                }
                r.l--
        }
        return buf
}

// addH is a handcoded specialization of genericAdd (see above) for hSample
// buffers. It stores a copy of s in the ring whose backing slice is buf and
// whose indices are kept in r, then drops all samples from the head of the
// ring that are older than s.T() - r.delta. It returns the backing slice,
// which is a newly allocated one if the ring had to grow.
//
// Unlike addSample and addF, the histogram of s is not stored directly: it is
// copied into the slot, so the ring does not hold on to s.h.
func addH(s hSample, buf []hSample, r *sampleRing) []hSample {
        l := len(buf)
        // Grow the ring buffer if it fits no more elements.
        if l == 0 {
                // No backing slice yet: start out with 16 slots.
                buf = make([]hSample, 16)
                l = 16
        }
        if l == r.l {
                // The ring is full: double its size. Everything from the first
                // element to the end of the old slice moves to the end of the
                // new slice, everything before the first element stays at the
                // start. The gap in between begins at the old r.f, which is
                // where the new sample is written.
                newBuf := make([]hSample, 2*l)
                copy(newBuf[l+r.f:], buf[r.f:])
                copy(newBuf, buf[:r.f])

                buf = newBuf
                r.i = r.f
                r.f += l
                l = 2 * l
        } else {
                // There is room left: advance the write index, wrapping around
                // at the end of the slice.
                r.i++
                if r.i >= l {
                        r.i -= l
                }
        }

        buf[r.i].t = s.t
        // Reuse the histogram already present in this slot, if there is one,
        // instead of allocating a new copy.
        if buf[r.i].h == nil {
                buf[r.i].h = s.h.Copy()
        } else {
                s.h.CopyTo(buf[r.i].h)
        }
        r.l++

        // Free head of the buffer of samples that just fell out of the range.
        tmin := s.T() - r.delta
        for buf[r.f].T() < tmin {
                r.f++
                if r.f >= l {
                        r.f -= l
                }
                r.l--
        }
        return buf
}

// addFH is a handcoded specialization of genericAdd (see above) for fhSample
// buffers. It stores a copy of s in the ring whose backing slice is buf and
// whose indices are kept in r, then drops all samples from the head of the
// ring that are older than s.T() - r.delta. It returns the backing slice,
// which is a newly allocated one if the ring had to grow.
//
// Unlike addSample and addF, the float histogram of s is not stored directly:
// it is copied into the slot, so the ring does not hold on to s.fh.
func addFH(s fhSample, buf []fhSample, r *sampleRing) []fhSample {
        l := len(buf)
        // Grow the ring buffer if it fits no more elements.
        if l == 0 {
                // No backing slice yet: start out with 16 slots.
                buf = make([]fhSample, 16)
                l = 16
        }
        if l == r.l {
                // The ring is full: double its size. Everything from the first
                // element to the end of the old slice moves to the end of the
                // new slice, everything before the first element stays at the
                // start. The gap in between begins at the old r.f, which is
                // where the new sample is written.
                newBuf := make([]fhSample, 2*l)
                copy(newBuf[l+r.f:], buf[r.f:])
                copy(newBuf, buf[:r.f])

                buf = newBuf
                r.i = r.f
                r.f += l
                l = 2 * l
        } else {
                // There is room left: advance the write index, wrapping around
                // at the end of the slice.
                r.i++
                if r.i >= l {
                        r.i -= l
                }
        }

        buf[r.i].t = s.t
        // Reuse the float histogram already present in this slot, if there is
        // one, instead of allocating a new copy.
        if buf[r.i].fh == nil {
                buf[r.i].fh = s.fh.Copy()
        } else {
                s.fh.CopyTo(buf[r.i].fh)
        }
        r.l++

        // Free head of the buffer of samples that just fell out of the range.
        tmin := s.T() - r.delta
        for buf[r.f].T() < tmin {
                r.f++
                if r.f >= l {
                        r.f -= l
                }
                r.l--
        }
        return buf
}

// reduceDelta shrinks the time delta covered by the ring and discards the
// samples that no longer fit into it. A delta larger than the current one is
// rejected: nothing is changed and false is returned. Otherwise the new delta
// is applied and true is returned.
func (r *sampleRing) reduceDelta(delta int64) bool {
        if delta > r.delta {
                return false
        }
        r.delta = delta

        // Only a non-empty ring can have samples to drop.
        if r.l != 0 {
                switch r.bufInUse {
                case fBuf:
                        genericReduceDelta(r.fBuf, r)
                case hBuf:
                        genericReduceDelta(r.hBuf, r)
                case fhBuf:
                        genericReduceDelta(r.fhBuf, r)
                default:
                        genericReduceDelta(r.iBuf, r)
                }
        }
        return true
}

// genericReduceDelta advances the first index of the ring past every sample in
// buf whose timestamp is more than r.delta older than that of the sample at
// the current write index r.i, adjusting the ring length accordingly.
func genericReduceDelta[T chunks.Sample](buf []T, r *sampleRing) {
        size := len(buf)
        cutoff := buf[r.i].T() - r.delta
        // Drop samples from the head until the head is within range again.
        for buf[r.f].T() < cutoff {
                r.l--
                if r.f++; r.f >= size {
                        r.f -= size
                }
        }
}

// nthLast returns the nth most recent element added to the ring. The boolean
// is false (and the sample an empty fSample) if the ring holds fewer than n
// elements.
func (r *sampleRing) nthLast(n int) (chunks.Sample, bool) {
        if n > r.l {
                return fSample{}, false
        }

        var (
                pos = r.l - n
                res chunks.Sample
        )
        switch r.bufInUse {
        case fBuf:
                res = r.atF(pos)
        case hBuf:
                res = r.atH(pos)
        case fhBuf:
                res = r.atFH(pos)
        default:
                res = r.at(pos)
        }
        return res, true
}

// samples returns a newly allocated slice holding all elements of the ring,
// starting with the first element of the ring and following the ring order
// across the wrap-around point of the buffer in use.
func (r *sampleRing) samples() []chunks.Sample {
        out := make([]chunks.Sample, r.l)

        // end is the exclusive upper bound of the run that starts at r.f.
        // wrapped is the number of elements that continue at the start of the
        // buffer once the run hits the end of it.
        end, wrapped := r.f+r.l, 0

        switch r.bufInUse {
        case iBuf:
                if size := len(r.iBuf); end > size {
                        end, wrapped = size, r.l-size+r.f
                }
                n := copy(out, r.iBuf[r.f:end])
                copy(out[n:], r.iBuf[:wrapped])
        case fBuf:
                if size := len(r.fBuf); end > size {
                        end, wrapped = size, r.l-size+r.f
                }
                tmp := make([]fSample, r.l)
                n := copy(tmp, r.fBuf[r.f:end])
                copy(tmp[n:], r.fBuf[:wrapped])
                for i := range tmp {
                        out[i] = tmp[i]
                }
        case hBuf:
                if size := len(r.hBuf); end > size {
                        end, wrapped = size, r.l-size+r.f
                }
                tmp := make([]hSample, r.l)
                n := copy(tmp, r.hBuf[r.f:end])
                copy(tmp[n:], r.hBuf[:wrapped])
                for i := range tmp {
                        out[i] = tmp[i]
                }
        case fhBuf:
                if size := len(r.fhBuf); end > size {
                        end, wrapped = size, r.l-size+r.f
                }
                tmp := make([]fhSample, r.l)
                n := copy(tmp, r.fhBuf[r.f:end])
                copy(tmp[n:], r.fhBuf[:wrapped])
                for i := range tmp {
                        out[i] = tmp[i]
                }
        }

        return out
}

// Copyright 2014 The Prometheus Authors
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package storage

import "fmt"

// errDuplicateSampleForTimestamp is the error for a float sample whose
// timestamp is already taken by a sample with another value. It records the
// timestamp together with the existing and the new value.
type errDuplicateSampleForTimestamp struct {
        timestamp int64
        existing  float64
        newValue  float64
}

// NewDuplicateFloatErr builds a duplicate-sample error for timestamp t that
// carries the existing value and the value that was to be written.
func NewDuplicateFloatErr(t int64, existing, newValue float64) error {
        var e errDuplicateSampleForTimestamp
        e.timestamp = t
        e.existing = existing
        e.newValue = newValue
        return e
}

// Error implements the error interface. An error with a zero timestamp
// produces only the generic message; otherwise the timestamp and both values
// are included.
func (e errDuplicateSampleForTimestamp) Error() string {
        if e.timestamp != 0 {
                return fmt.Sprintf("duplicate sample for timestamp %d; overrides not allowed: existing %g, new value %g", e.timestamp, e.existing, e.newValue)
        }
        return "duplicate sample for timestamp"
}

// Is reports whether e matches the target t. Every
// errDuplicateSampleForTimestamp matches the global
// ErrDuplicateSampleForTimestamp. Any other target matches only if it is an
// errDuplicateSampleForTimestamp with identical field values.
func (e errDuplicateSampleForTimestamp) Is(t error) bool {
        if t == ErrDuplicateSampleForTimestamp {
                return true
        }
        other, sameType := t.(errDuplicateSampleForTimestamp)
        return sameType && e == other
}

// Copyright 2017 The Prometheus Authors
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package storage

import (
        "context"

        "github.com/go-kit/log"
        "github.com/go-kit/log/level"
        "github.com/prometheus/common/model"

        "github.com/prometheus/prometheus/model/exemplar"
        "github.com/prometheus/prometheus/model/histogram"
        "github.com/prometheus/prometheus/model/labels"
        "github.com/prometheus/prometheus/model/metadata"
        tsdb_errors "github.com/prometheus/prometheus/tsdb/errors"
)

// fanout is the Storage returned by NewFanout. It combines one primary
// Storage with any number of secondary ones.
type fanout struct {
        // logger is handed on to the appenders created by Appender.
        logger log.Logger

        primary     Storage
        secondaries []Storage
}

// NewFanout returns a new fanout Storage, which proxies reads and writes
// through to multiple underlying storages.
//
// Primary and secondary storages differ only on the read (Querier) path:
//   - If the primary querier returns an error, any of the Querier operations
//     will fail.
//   - If a secondary querier returns an error, the result from that querier is
//     discarded. The overall operation succeeds, and the error from the
//     secondary querier is returned as a warning.
//
// NOTE: In the case of Prometheus, it treats all remote storages as secondary / best effort.
func NewFanout(logger log.Logger, primary Storage, secondaries ...Storage) Storage {
        f := new(fanout)
        f.logger = logger
        f.primary = primary
        f.secondaries = secondaries
        return f
}

// StartTime implements the Storage interface. It reports the earliest start
// time among the primary and all secondary storages. If any of them fails to
// report its start time, model.Latest is returned together with that error.
func (f *fanout) StartTime() (int64, error) {
        earliest, err := f.primary.StartTime()
        if err != nil {
                return int64(model.Latest), err
        }

        for i := range f.secondaries {
                start, err := f.secondaries[i].StartTime()
                if err != nil {
                        return int64(model.Latest), err
                }
                if start < earliest {
                        earliest = start
                }
        }
        return earliest, nil
}

// Querier opens a Querier on the primary and on every secondary storage for
// the given time range and returns them combined into one merge querier.
// Secondary queriers of type noopQuerier are left out of the merge. If opening
// any querier fails, the queriers opened so far are closed and the errors are
// returned together.
func (f *fanout) Querier(mint, maxt int64) (Querier, error) {
        primaryQ, err := f.primary.Querier(mint, maxt)
        if err != nil {
                return nil, err
        }

        secondaryQs := make([]Querier, 0, len(f.secondaries))
        for _, s := range f.secondaries {
                q, err := s.Querier(mint, maxt)
                if err == nil {
                        if _, isNoop := q.(noopQuerier); !isNoop {
                                secondaryQs = append(secondaryQs, q)
                        }
                        continue
                }

                // Close already open Queriers, append potential errors to returned error.
                errs := tsdb_errors.NewMulti(err, primaryQ.Close())
                for _, open := range secondaryQs {
                        errs.Add(open.Close())
                }
                return nil, errs.Err()
        }
        return NewMergeQuerier([]Querier{primaryQ}, secondaryQs, ChainedSeriesMerge), nil
}

// ChunkQuerier opens a ChunkQuerier on the primary and on every secondary
// storage for the given time range and returns them combined into one merge
// chunk querier. If opening any querier fails, the queriers opened so far are
// closed and the errors are returned together.
func (f *fanout) ChunkQuerier(mint, maxt int64) (ChunkQuerier, error) {
        primaryQ, err := f.primary.ChunkQuerier(mint, maxt)
        if err != nil {
                return nil, err
        }

        secondaryQs := make([]ChunkQuerier, 0, len(f.secondaries))
        for _, s := range f.secondaries {
                q, err := s.ChunkQuerier(mint, maxt)
                if err == nil {
                        secondaryQs = append(secondaryQs, q)
                        continue
                }

                // Close already open Queriers, append potential errors to returned error.
                errs := tsdb_errors.NewMulti(err, primaryQ.Close())
                for _, open := range secondaryQs {
                        errs.Add(open.Close())
                }
                return nil, errs.Err()
        }
        return NewMergeChunkQuerier([]ChunkQuerier{primaryQ}, secondaryQs, NewCompactingChunkSeriesMerger(ChainedSeriesMerge)), nil
}

// Appender returns an Appender that wraps one appender of the primary storage
// and one appender of each secondary storage, all created with ctx.
func (f *fanout) Appender(ctx context.Context) Appender {
        app := &fanoutAppender{
                logger:      f.logger,
                primary:     f.primary.Appender(ctx),
                secondaries: make([]Appender, 0, len(f.secondaries)),
        }
        for _, s := range f.secondaries {
                app.secondaries = append(app.secondaries, s.Appender(ctx))
        }
        return app
}

// Close closes the primary and all secondary storages. Every storage is
// closed regardless of earlier failures; the collected errors are returned
// together.
func (f *fanout) Close() error {
        errs := tsdb_errors.NewMulti(f.primary.Close())
        for i := range f.secondaries {
                errs.Add(f.secondaries[i].Close())
        }
        return errs.Err()
}

// fanoutAppender implements Appender. It forwards every operation to the
// primary appender first and then to each of the secondary appenders.
type fanoutAppender struct {
        // logger receives the rollback errors that cannot be returned to the
        // caller.
        logger log.Logger

        primary     Appender
        secondaries []Appender
}

// Append appends the sample to the primary appender and then, using the
// reference returned by the primary, to every secondary appender. A failure
// of the primary is returned along with the primary's reference. The first
// failure of a secondary is returned with a zero reference, and the remaining
// secondaries are skipped.
func (f *fanoutAppender) Append(ref SeriesRef, l labels.Labels, t int64, v float64) (SeriesRef, error) {
        primaryRef, err := f.primary.Append(ref, l, t, v)
        if err != nil {
                return primaryRef, err
        }

        for i := range f.secondaries {
                _, err := f.secondaries[i].Append(primaryRef, l, t, v)
                if err != nil {
                        return 0, err
                }
        }
        return primaryRef, nil
}

// AppendExemplar appends the exemplar to the primary appender and then, using
// the reference returned by the primary, to every secondary appender. A
// failure of the primary is returned along with the primary's reference. The
// first failure of a secondary is returned with a zero reference, and the
// remaining secondaries are skipped.
func (f *fanoutAppender) AppendExemplar(ref SeriesRef, l labels.Labels, e exemplar.Exemplar) (SeriesRef, error) {
        primaryRef, err := f.primary.AppendExemplar(ref, l, e)
        if err != nil {
                return primaryRef, err
        }

        for i := range f.secondaries {
                _, err := f.secondaries[i].AppendExemplar(primaryRef, l, e)
                if err != nil {
                        return 0, err
                }
        }
        return primaryRef, nil
}

// AppendHistogram appends the histogram sample to the primary appender and
// then, using the reference returned by the primary, to every secondary
// appender. A failure of the primary is returned along with the primary's
// reference. The first failure of a secondary is returned with a zero
// reference, and the remaining secondaries are skipped.
func (f *fanoutAppender) AppendHistogram(ref SeriesRef, l labels.Labels, t int64, h *histogram.Histogram, fh *histogram.FloatHistogram) (SeriesRef, error) {
        primaryRef, err := f.primary.AppendHistogram(ref, l, t, h, fh)
        if err != nil {
                return primaryRef, err
        }

        for i := range f.secondaries {
                _, err := f.secondaries[i].AppendHistogram(primaryRef, l, t, h, fh)
                if err != nil {
                        return 0, err
                }
        }
        return primaryRef, nil
}

// UpdateMetadata updates the metadata on the primary appender and then, using
// the reference returned by the primary, on every secondary appender. A
// failure of the primary is returned along with the primary's reference. The
// first failure of a secondary is returned with a zero reference, and the
// remaining secondaries are skipped.
func (f *fanoutAppender) UpdateMetadata(ref SeriesRef, l labels.Labels, m metadata.Metadata) (SeriesRef, error) {
        primaryRef, err := f.primary.UpdateMetadata(ref, l, m)
        if err != nil {
                return primaryRef, err
        }

        for i := range f.secondaries {
                _, err := f.secondaries[i].UpdateMetadata(primaryRef, l, m)
                if err != nil {
                        return 0, err
                }
        }
        return primaryRef, nil
}

// AppendCTZeroSample forwards the call to the primary appender and then, using
// the reference returned by the primary, to every secondary appender. A
// failure of the primary is returned along with the primary's reference. The
// first failure of a secondary is returned with a zero reference, and the
// remaining secondaries are skipped.
func (f *fanoutAppender) AppendCTZeroSample(ref SeriesRef, l labels.Labels, t, ct int64) (SeriesRef, error) {
        primaryRef, err := f.primary.AppendCTZeroSample(ref, l, t, ct)
        if err != nil {
                return primaryRef, err
        }

        for i := range f.secondaries {
                _, err := f.secondaries[i].AppendCTZeroSample(primaryRef, l, t, ct)
                if err != nil {
                        return 0, err
                }
        }
        return primaryRef, nil
}

// Commit commits the primary appender and then the secondary appenders in
// order. As soon as one commit has failed, all remaining secondaries are
// rolled back instead of committed. The first commit error is returned;
// errors from those rollbacks are only logged.
func (f *fanoutAppender) Commit() (err error) {
        err = f.primary.Commit()

        for _, app := range f.secondaries {
                if err == nil {
                        err = app.Commit()
                        continue
                }
                // An earlier commit failed: roll this appender back.
                if rbErr := app.Rollback(); rbErr != nil {
                        level.Error(f.logger).Log("msg", "Squashed rollback error on commit", "err", rbErr)
                }
        }
        return err
}

// Rollback rolls back the primary appender and every secondary appender. All
// appenders are rolled back even if an earlier rollback failed. The first
// error encountered is returned; any further rollback errors are only logged.
func (f *fanoutAppender) Rollback() (err error) {
        err = f.primary.Rollback()

        for _, appender := range f.secondaries {
                rollbackErr := appender.Rollback()
                switch {
                case err == nil:
                        // No error so far: remember this result (which may be nil).
                        err = rollbackErr
                case rollbackErr != nil:
                        // An earlier error is already being returned, so this
                        // one can only be logged.
                        level.Error(f.logger).Log("msg", "Squashed rollback error on rollback", "err", rollbackErr)
                }
        }
        // Return the collected error instead of discarding it.
        return err
}

// Copyright 2020 The Prometheus Authors
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// This file holds boilerplate adapters for generic MergeSeriesSet and MergeQuerier functions, so we can have one optimized
// solution that works for both ChunkSeriesSet as well as SeriesSet.

package storage

import (
        "context"

        "github.com/prometheus/prometheus/model/labels"
        "github.com/prometheus/prometheus/util/annotations"
)

// genericQuerier is the querier shape used by the generic merge code: a
// LabelQuerier whose Select returns a genericSeriesSet, so that the same code
// can work for both SeriesSet and ChunkSeriesSet.
type genericQuerier interface {
        LabelQuerier
        Select(context.Context, bool, *SelectHints, ...*labels.Matcher) genericSeriesSet
}

// genericSeriesSet is the series set shape used by the generic merge code. It
// has the same methods as a series set, except that At returns the element as
// Labels.
type genericSeriesSet interface {
        Next() bool
        At() Labels
        Err() error
        Warnings() annotations.Annotations
}

// genericSeriesMergeFunc is the merge function shape used by the generic merge
// code: it takes any number of Labels values and returns a single one.
type genericSeriesMergeFunc func(...Labels) Labels

// genericSeriesSetAdapter wraps a SeriesSet so that it can be used as a
// genericSeriesSet.
type genericSeriesSetAdapter struct {
        SeriesSet
}

// At returns the current element of the wrapped SeriesSet as Labels.
func (a *genericSeriesSetAdapter) At() Labels {
        var cur Labels = a.SeriesSet.At()
        return cur
}

// genericChunkSeriesSetAdapter wraps a ChunkSeriesSet so that it can be used
// as a genericSeriesSet.
type genericChunkSeriesSetAdapter struct {
        ChunkSeriesSet
}

// At returns the current element of the wrapped ChunkSeriesSet as Labels.
func (a *genericChunkSeriesSetAdapter) At() Labels {
        var cur Labels = a.ChunkSeriesSet.At()
        return cur
}

// genericQuerierAdapter wraps either a Querier or a ChunkQuerier so that it
// can be used as a genericQuerier.
type genericQuerierAdapter struct {
        // LabelQuerier is the wrapped querier; it provides the label methods.
        LabelQuerier

        // One-of. If both are set, Querier will be used.
        q  Querier
        cq ChunkQuerier
}

// Select runs the selection on the wrapped querier and returns the result as
// a genericSeriesSet. The Querier is used if it is set, the ChunkQuerier
// otherwise.
func (q *genericQuerierAdapter) Select(ctx context.Context, sortSeries bool, hints *SelectHints, matchers ...*labels.Matcher) genericSeriesSet {
        if q.q == nil {
                return &genericChunkSeriesSetAdapter{ChunkSeriesSet: q.cq.Select(ctx, sortSeries, hints, matchers...)}
        }
        return &genericSeriesSetAdapter{SeriesSet: q.q.Select(ctx, sortSeries, hints, matchers...)}
}

// newGenericQuerierFrom wraps the Querier q into a genericQuerier.
func newGenericQuerierFrom(q Querier) genericQuerier {
        adapter := &genericQuerierAdapter{q: q}
        adapter.LabelQuerier = q
        return adapter
}

// newGenericQuerierFromChunk wraps the ChunkQuerier cq into a genericQuerier.
func newGenericQuerierFromChunk(cq ChunkQuerier) genericQuerier {
        adapter := &genericQuerierAdapter{cq: cq}
        adapter.LabelQuerier = cq
        return adapter
}

// querierAdapter wraps a genericQuerier and provides a Select method that
// returns a SeriesSet.
type querierAdapter struct {
        genericQuerier
}

// seriesSetAdapter wraps a genericSeriesSet and provides an At method that
// returns a Series.
type seriesSetAdapter struct {
        genericSeriesSet
}

// At returns the current element of the wrapped set as a Series. It panics if
// the element is not a Series.
func (a *seriesSetAdapter) At() Series {
        cur := a.genericSeriesSet.At()
        return cur.(Series)
}

// Select runs the selection on the wrapped genericQuerier and returns the
// result wrapped as a SeriesSet.
func (q *querierAdapter) Select(ctx context.Context, sortSeries bool, hints *SelectHints, matchers ...*labels.Matcher) SeriesSet {
        set := q.genericQuerier.Select(ctx, sortSeries, hints, matchers...)
        return &seriesSetAdapter{genericSeriesSet: set}
}

// chunkQuerierAdapter wraps a genericQuerier and provides a Select method that
// returns a ChunkSeriesSet.
type chunkQuerierAdapter struct {
        genericQuerier
}

// chunkSeriesSetAdapter wraps a genericSeriesSet and provides an At method
// that returns a ChunkSeries.
type chunkSeriesSetAdapter struct {
        genericSeriesSet
}

// At returns the current element of the wrapped set as a ChunkSeries. It
// panics if the element is not a ChunkSeries.
func (a *chunkSeriesSetAdapter) At() ChunkSeries {
        cur := a.genericSeriesSet.At()
        return cur.(ChunkSeries)
}

// Select runs the selection on the wrapped genericQuerier and returns the
// result wrapped as a ChunkSeriesSet.
func (q *chunkQuerierAdapter) Select(ctx context.Context, sortSeries bool, hints *SelectHints, matchers ...*labels.Matcher) ChunkSeriesSet {
        set := q.genericQuerier.Select(ctx, sortSeries, hints, matchers...)
        return &chunkSeriesSetAdapter{genericSeriesSet: set}
}

// seriesMergerAdapter wraps a VerticalSeriesMergeFunc and provides a Merge
// method that works on Labels values.
type seriesMergerAdapter struct {
        VerticalSeriesMergeFunc
}

// Merge converts every element of s to a Series and passes them to the wrapped
// merge function. It panics if an element is not a Series.
func (a *seriesMergerAdapter) Merge(s ...Labels) Labels {
        series := make([]Series, len(s))
        for i := range s {
                series[i] = s[i].(Series)
        }
        return a.VerticalSeriesMergeFunc(series...)
}

// chunkSeriesMergerAdapter wraps a VerticalChunkSeriesMergeFunc and provides a
// Merge method that works on Labels values.
type chunkSeriesMergerAdapter struct {
        VerticalChunkSeriesMergeFunc
}

// Merge converts every element of s to a ChunkSeries and passes them to the
// wrapped merge function. It panics if an element is not a ChunkSeries.
func (a *chunkSeriesMergerAdapter) Merge(s ...Labels) Labels {
        series := make([]ChunkSeries, len(s))
        for i := range s {
                series[i] = s[i].(ChunkSeries)
        }
        return a.VerticalChunkSeriesMergeFunc(series...)
}

// noopGenericSeriesSet is a genericSeriesSet without any elements, errors or
// warnings.
type noopGenericSeriesSet struct{}

// Next always reports that there is no further element.
func (noopGenericSeriesSet) Next() bool {
        return false
}

// At always returns nil.
func (noopGenericSeriesSet) At() Labels {
        return nil
}

// Err always returns nil.
func (noopGenericSeriesSet) Err() error {
        return nil
}

// Warnings always returns nil.
func (noopGenericSeriesSet) Warnings() annotations.Annotations {
        return nil
}

// Copyright 2014 The Prometheus Authors
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package storage

import (
        "context"
        "errors"
        "fmt"

        "github.com/prometheus/prometheus/model/exemplar"
        "github.com/prometheus/prometheus/model/histogram"
        "github.com/prometheus/prometheus/model/labels"
        "github.com/prometheus/prometheus/model/metadata"
        "github.com/prometheus/prometheus/tsdb/chunkenc"
        "github.com/prometheus/prometheus/tsdb/chunks"
        "github.com/prometheus/prometheus/util/annotations"
)

// The errors exposed.
var (
        ErrNotFound = errors.New("not found")
        // ErrOutOfOrderSample is when out of order support is disabled and the sample is out of order.
        ErrOutOfOrderSample = errors.New("out of order sample")
        // ErrOutOfBounds is when out of order support is disabled and the sample is older than the min valid time for the append.
        ErrOutOfBounds = errors.New("out of bounds")
        // ErrTooOldSample is when out of order support is enabled but the sample is outside the time window allowed.
        ErrTooOldSample = errors.New("too old sample")
        // ErrDuplicateSampleForTimestamp is when the sample has the same timestamp but a different value.
        ErrDuplicateSampleForTimestamp = errDuplicateSampleForTimestamp{}
        ErrOutOfOrderExemplar          = errors.New("out of order exemplar")
        ErrDuplicateExemplar           = errors.New("duplicate exemplar")
        ErrExemplarLabelLength         = fmt.Errorf("label length for exemplar exceeds maximum of %d UTF-8 characters", exemplar.ExemplarMaxLabelSetLength)
        ErrExemplarsDisabled           = fmt.Errorf("exemplar storage is disabled or max exemplars is less than or equal to 0")
        ErrNativeHistogramsDisabled    = fmt.Errorf("native histograms are disabled")

        // ErrOutOfOrderCT indicates a failed append of a CT to the storage
        // due to the CT being older than the newer sample.
        // NOTE(bwplotka): This can be both an instrumentation failure or commonly expected
        // behaviour, and we currently don't have a way to determine this. As a result
        // it's recommended to ignore this error for now.
        ErrOutOfOrderCT = fmt.Errorf("created timestamp out of order, ignoring")
)

// SeriesRef is a generic series reference. In Prometheus it is either a
// HeadSeriesRef or BlockSeriesRef, though other implementations may have
// their own reference types.
type SeriesRef uint64

// Appendable allows creating appenders.
type Appendable interface {
        // Appender returns a new appender for the storage. The implementation
        // can choose whether or not to use the context, for deadlines or to check
        // for errors.
        Appender(ctx context.Context) Appender
}

// SampleAndChunkQueryable allows retrieving samples as well as encoded samples
// in the form of chunks. It combines Queryable and ChunkQueryable.
type SampleAndChunkQueryable interface {
        Queryable
        ChunkQueryable
}

// Storage ingests and manages samples, along with various indexes. All methods
// are goroutine-safe. Storage combines the read path (SampleAndChunkQueryable)
// with the write path (Appendable, which hands out Appenders).
type Storage interface {
        SampleAndChunkQueryable
        Appendable

        // StartTime returns the oldest timestamp stored in the storage.
        StartTime() (int64, error)

        // Close closes the storage and all its underlying resources.
        Close() error
}

// ExemplarStorage ingests and manages exemplars, along with various indexes. All methods are
// goroutine-safe. ExemplarStorage combines ExemplarQueryable (read path) and
// ExemplarAppender (write path).
type ExemplarStorage interface {
        ExemplarQueryable
        ExemplarAppender
}

// A Queryable handles queries against a storage.
// Use it when you need to have access to all samples without chunk encoding
// abstraction, e.g. PromQL.
type Queryable interface {
        // Querier returns a new Querier on the storage for the time range
        // given by mint and maxt.
        Querier(mint, maxt int64) (Querier, error)
}

// MockQueryable is a Queryable for tests that always hands out the Querier
// stored in MockQuerier.
type MockQueryable struct {
        MockQuerier Querier
}

// Querier ignores the requested time range and returns the configured
// MockQuerier together with a nil error.
func (q *MockQueryable) Querier(_, _ int64) (Querier, error) {
        mock := q.MockQuerier
        return mock, nil
}

// Querier provides querying access over time series data of a fixed time range.
// It also exposes label lookups and Close through the embedded LabelQuerier.
type Querier interface {
        LabelQuerier

        // Select returns a set of series that matches the given label matchers.
        // Caller can specify if it requires returned series to be sorted. Prefer not requiring sorting for better performance.
        // It allows passing hints that can help in optimising select, but it's up to implementation how this is used if used at all.
        Select(ctx context.Context, sortSeries bool, hints *SelectHints, matchers ...*labels.Matcher) SeriesSet
}

// MockQuerier is a Querier for tests. Its Select result is produced by the
// caller-supplied SelectMockFunction; all other methods are no-op stubs.
type MockQuerier struct {
        SelectMockFunction func(sortSeries bool, hints *SelectHints, matchers ...*labels.Matcher) SeriesSet
}

// LabelValues is a stub that returns no values, no annotations and no error.
func (q *MockQuerier) LabelValues(_ context.Context, _ string, _ ...*labels.Matcher) ([]string, annotations.Annotations, error) {
        return nil, nil, nil
}

// LabelNames is a stub that returns no names, no annotations and no error.
func (q *MockQuerier) LabelNames(_ context.Context, _ ...*labels.Matcher) ([]string, annotations.Annotations, error) {
        return nil, nil, nil
}

// Close is a no-op and always returns nil.
func (q *MockQuerier) Close() error { return nil }

// Select forwards everything except the context to SelectMockFunction and
// returns its result.
func (q *MockQuerier) Select(_ context.Context, sortSeries bool, hints *SelectHints, matchers ...*labels.Matcher) SeriesSet {
        selectFn := q.SelectMockFunction
        return selectFn(sortSeries, hints, matchers...)
}

// A ChunkQueryable handles queries against a storage.
// Use it when you need to have access to samples in encoded (chunk) format.
type ChunkQueryable interface {
        // ChunkQuerier returns a new ChunkQuerier on the storage for the time
        // range given by mint and maxt.
        ChunkQuerier(mint, maxt int64) (ChunkQuerier, error)
}

// ChunkQuerier provides querying access over time series data of a fixed time range.
// Unlike Querier, its Select returns a ChunkSeriesSet.
type ChunkQuerier interface {
        LabelQuerier

        // Select returns a set of series that matches the given label matchers.
        // Caller can specify if it requires returned series to be sorted. Prefer not requiring sorting for better performance.
        // It allows passing hints that can help in optimising select, but it's up to implementation how this is used if used at all.
        Select(ctx context.Context, sortSeries bool, hints *SelectHints, matchers ...*labels.Matcher) ChunkSeriesSet
}

// LabelQuerier provides querying access over labels. It is embedded in both
// Querier and ChunkQuerier.
type LabelQuerier interface {
        // LabelValues returns all potential values for a label name.
        // It is not safe to use the strings beyond the lifetime of the querier.
        // If matchers are specified the returned result set is reduced
        // to label values of metrics matching the matchers.
        LabelValues(ctx context.Context, name string, matchers ...*labels.Matcher) ([]string, annotations.Annotations, error)

        // LabelNames returns all the unique label names present in the block in sorted order.
        // If matchers are specified the returned result set is reduced
        // to label names of metrics matching the matchers.
        LabelNames(ctx context.Context, matchers ...*labels.Matcher) ([]string, annotations.Annotations, error)

        // Close releases the resources of the Querier.
        Close() error
}

// ExemplarQueryable allows creating queriers for exemplars.
type ExemplarQueryable interface {
        // ExemplarQuerier returns a new ExemplarQuerier on the storage.
        ExemplarQuerier(ctx context.Context) (ExemplarQuerier, error)
}

// ExemplarQuerier provides reading access to exemplars.
type ExemplarQuerier interface {
        // Select all the exemplars that match the matchers, between start and end.
        // Within a single slice of matchers, it is an intersection. Between the slices, it is a union.
        Select(start, end int64, matchers ...[]*labels.Matcher) ([]exemplar.QueryResult, error)
}

// SelectHints specifies hints passed for data selections.
// The hints are optional: an implementation may use them to optimise a
// Select call, or ignore them.
type SelectHints struct {
        Start int64 // Start time in milliseconds for this select.
        End   int64 // End time in milliseconds for this select.

        Step int64  // Query step size in milliseconds.
        Func string // String representation of surrounding function or aggregation.

        Grouping []string // List of label names used in aggregation.
        By       bool     // Indicate whether it is without or by.
        Range    int64    // Range vector selector range in milliseconds.

        // ShardCount is the total number of shards that series should be split into
        // at query time. Then, only series in the ShardIndex shard will be returned
        // by the query.
        //
        // ShardCount equal to 0 means that sharding is disabled.
        ShardCount uint64

        // ShardIndex is the series shard index to query. The index must be between 0 and ShardCount-1.
        // When ShardCount is set to a value > 0, then a query will only process series within the
        // ShardIndex's shard.
        //
        // Series are sharded by "labels stable hash" mod "ShardCount".
        ShardIndex uint64

        // DisableTrimming allows disabling the trimming of matching series chunks based on query Start and End time.
        // When trimming is disabled, the result may contain samples outside the queried time range, but Select()
        // performance may be improved.
        DisableTrimming bool
}

// QueryableFunc adapts an ordinary function so it can be used as a
// Queryable, in the spirit of http.HandlerFunc.
//
// TODO(bwplotka): Move to promql/engine_test.go?
type QueryableFunc func(mint, maxt int64) (Querier, error)

// Querier implements Queryable by invoking f with the given time range.
func (f QueryableFunc) Querier(mint, maxt int64) (Querier, error) {
        querier, err := f(mint, maxt)
        return querier, err
}

// Appender provides batched appends against a storage.
// It must be completed with a call to Commit or Rollback and must not be reused afterwards.
//
// Operations on the Appender interface are not goroutine-safe.
//
// The type of samples (float64, histogram, etc) appended for a given series must remain the same within an Appender.
// The behaviour is undefined if samples of different types are appended to the same series in a single Commit().
type Appender interface {
        // Append adds a sample pair for the given series.
        // An optional series reference can be provided to accelerate calls.
        // A series reference number is returned which can be used to add further
        // samples to the given series in the same or later transactions.
        // Returned reference numbers are ephemeral and may be rejected in calls
        // to Append() at any point. Adding the sample via Append() returns a new
        // reference number.
        // If the reference is 0 it must not be used for caching.
        Append(ref SeriesRef, l labels.Labels, t int64, v float64) (SeriesRef, error)

        // Commit submits the collected samples and purges the batch. If Commit
        // returns a non-nil error, it also rolls back all modifications made in
        // the appender so far, as Rollback would do. In any case, an Appender
        // must not be used anymore after Commit has been called.
        Commit() error

        // Rollback rolls back all modifications made in the appender so far.
        // Appender has to be discarded after rollback.
        Rollback() error

        // The embedded interfaces add appending of exemplars, histograms,
        // metadata and created-timestamp zero samples to the same batch.
        ExemplarAppender
        HistogramAppender
        MetadataUpdater
        CreatedTimestampAppender
}

// GetRef is an extra interface on Appenders used by downstream projects
// (e.g. Cortex) to avoid maintaining a parallel set of references.
type GetRef interface {
        // GetRef returns a reference number that can be passed to Appender.Append(),
        // and a set of labels that will not cause another copy when passed to Appender.Append().
        // 0 means the appender does not have a reference to this series.
        // hash should be a hash of lset.
        GetRef(lset labels.Labels, hash uint64) (SeriesRef, labels.Labels)
}

// ExemplarAppender provides an interface for adding exemplars to exemplar storage, which
// within Prometheus is in-memory only.
type ExemplarAppender interface {
        // AppendExemplar adds an exemplar for the given series labels.
        // An optional reference number can be provided to accelerate calls.
        // A reference number is returned which can be used to add further
        // exemplars in the same or later transactions.
        // Returned reference numbers are ephemeral and may be rejected in calls
        // to Append() at any point. Adding the sample via Append() returns a new
        // reference number.
        // If the reference is 0 it must not be used for caching.
        // Note that in our current implementation of Prometheus' exemplar storage
        // calls to Append should generate the reference numbers; AppendExemplar
        // generating a new reference number should be considered possibly erroneous
        // behaviour and be logged.
        AppendExemplar(ref SeriesRef, l labels.Labels, e exemplar.Exemplar) (SeriesRef, error)
}

// HistogramAppender provides an interface for appending histograms to the storage.
type HistogramAppender interface {
        // AppendHistogram adds a histogram for the given series labels. An
        // optional reference number can be provided to accelerate calls. A
        // reference number is returned which can be used to add further
        // histograms in the same or later transactions. Returned reference
        // numbers are ephemeral and may be rejected in calls to Append() at any
        // point. Adding the sample via Append() returns a new reference number.
        // If the reference is 0, it must not be used for caching.
        //
        // The histogram can be given as an integer histogram (h) or as a float
        // histogram (fh).
        // NOTE(review): presumably exactly one of h and fh is expected to be
        // non-nil; confirm against the implementations.
        //
        // For efficiency reasons, the histogram is passed as a
        // pointer. AppendHistogram won't mutate the histogram, but in turn
        // depends on the caller to not mutate it either.
        AppendHistogram(ref SeriesRef, l labels.Labels, t int64, h *histogram.Histogram, fh *histogram.FloatHistogram) (SeriesRef, error)
}

// MetadataUpdater provides an interface for associating metadata with stored series.
type MetadataUpdater interface {
        // UpdateMetadata updates a metadata entry for the given series and labels.
        // A series reference number is returned which can be used to modify the
        // metadata of the given series in the same or later transactions.
        // Returned reference numbers are ephemeral and may be rejected in calls
        // to UpdateMetadata() at any point. If the series does not exist,
        // UpdateMetadata returns an error.
        // If the reference is 0 it must not be used for caching.
        UpdateMetadata(ref SeriesRef, l labels.Labels, m metadata.Metadata) (SeriesRef, error)
}

// CreatedTimestampAppender provides an interface for appending created
// timestamps (CT) to storage.
type CreatedTimestampAppender interface {
        // AppendCTZeroSample adds a synthetic zero sample for the given ct timestamp,
        // which will be associated with the given series, labels and the incoming
        // sample's t (timestamp). AppendCTZeroSample returns an error if the zero sample can't be
        // appended, for example when ct is too old, or when it would collide with
        // the incoming sample (the sample has priority).
        //
        // AppendCTZeroSample has to be called before the corresponding sample Append.
        // A series reference number is returned which can be used to modify the
        // CT for the given series in the same or later transactions.
        // Returned reference numbers are ephemeral and may be rejected in calls
        // to AppendCTZeroSample() at any point.
        //
        // If the reference is 0 it must not be used for caching.
        AppendCTZeroSample(ref SeriesRef, l labels.Labels, t, ct int64) (SeriesRef, error)
}

// SeriesSet contains a set of series. It is consumed by calling Next
// repeatedly and reading the current series with At.
type SeriesSet interface {
        // Next advances the set and reports whether there is a series to read with At.
        Next() bool
        // At returns full series. Returned series should be iterable even after Next is called.
        At() Series
        // Err returns the error that iteration has failed with.
        // When an error occurs, set cannot continue to iterate.
        Err() error
        // Warnings returns a collection of warnings for the whole set.
        // Warnings could be returned even if iteration has not failed with an error.
        Warnings() annotations.Annotations
}

// emptySeriesSet is the shared value returned by EmptySeriesSet: an
// errSeriesSet with no error set.
var emptySeriesSet = errSeriesSet{}

// EmptySeriesSet returns a series set that's always empty.
func EmptySeriesSet() SeriesSet {
        return emptySeriesSet
}

// testSeriesSet is a SeriesSet stub that serves a single fixed series.
type testSeriesSet struct {
        series Series
}

// Next always reports true; the set never signals exhaustion.
func (testSeriesSet) Next() bool {
        return true
}

// At returns the wrapped series.
func (s testSeriesSet) At() Series {
        return s.series
}

// Err always returns nil.
func (testSeriesSet) Err() error {
        return nil
}

// Warnings always returns nil.
func (testSeriesSet) Warnings() annotations.Annotations {
        return nil
}

// TestSeriesSet returns a mock series set that serves the given series.
func TestSeriesSet(series Series) SeriesSet {
        set := testSeriesSet{series: series}
        return set
}

// errSeriesSet is a SeriesSet that holds no series and only carries an error.
type errSeriesSet struct {
        err error
}

// Next always reports false because the set holds no series.
func (errSeriesSet) Next() bool {
        return false
}

// At always returns nil.
func (errSeriesSet) At() Series {
        return nil
}

// Err returns the wrapped error, which may be nil.
func (s errSeriesSet) Err() error {
        return s.err
}

// Warnings always returns nil.
func (errSeriesSet) Warnings() annotations.Annotations {
        return nil
}

// ErrSeriesSet returns a series set that wraps an error.
func ErrSeriesSet(err error) SeriesSet {
        set := errSeriesSet{err: err}
        return set
}

// emptyChunkSeriesSet is the shared value returned by EmptyChunkSeriesSet: an
// errChunkSeriesSet with no error set.
var emptyChunkSeriesSet = errChunkSeriesSet{}

// EmptyChunkSeriesSet returns a chunk series set that's always empty.
func EmptyChunkSeriesSet() ChunkSeriesSet {
        return emptyChunkSeriesSet
}

// errChunkSeriesSet is a ChunkSeriesSet that holds no series and only carries
// an error.
type errChunkSeriesSet struct {
        err error
}

// Next always reports false because the set holds no series.
func (errChunkSeriesSet) Next() bool {
        return false
}

// At always returns nil.
func (errChunkSeriesSet) At() ChunkSeries {
        return nil
}

// Err returns the wrapped error, which may be nil.
func (s errChunkSeriesSet) Err() error {
        return s.err
}

// Warnings always returns nil.
func (errChunkSeriesSet) Warnings() annotations.Annotations {
        return nil
}

// ErrChunkSeriesSet returns a chunk series set that wraps an error.
func ErrChunkSeriesSet(err error) ChunkSeriesSet {
        set := errChunkSeriesSet{err: err}
        return set
}

// Series exposes a single time series and allows iterating over samples.
// It combines Labels (the series identity) with SampleIterable (its data).
type Series interface {
        Labels
        SampleIterable
}

// mockSeries is a Series built from plain slices, as created by MockSeries.
type mockSeries struct {
        timestamps []int64
        values     []float64
        labelSet   []string
}

// Labels builds the label set from labelSet via labels.FromStrings.
// NOTE(review): presumably labelSet holds alternating names and values; confirm
// against labels.FromStrings.
func (s mockSeries) Labels() labels.Labels {
        lset := labels.FromStrings(s.labelSet...)
        return lset
}

// Iterator ignores the iterator passed for re-use and returns a
// chunkenc.MockSeriesIterator over the stored timestamps and values.
func (s mockSeries) Iterator(_ chunkenc.Iterator) chunkenc.Iterator {
        it := chunkenc.MockSeriesIterator(s.timestamps, s.values)
        return it
}

// MockSeries returns a series with custom timestamps, values and labelSet.
func MockSeries(timestamps []int64, values []float64, labelSet []string) Series {
        series := mockSeries{timestamps: timestamps, values: values, labelSet: labelSet}
        return series
}

// ChunkSeriesSet contains a set of chunked series. It is consumed by calling
// Next repeatedly and reading the current series with At.
type ChunkSeriesSet interface {
        // Next advances the set and reports whether there is a series to read with At.
        Next() bool
        // At returns full chunk series. Returned series should be iterable even after Next is called.
        At() ChunkSeries
        // Err returns the error that iteration has failed with.
        // When an error occurs, set cannot continue to iterate.
        Err() error
        // Warnings returns a collection of warnings for the whole set.
        // Warnings could be returned even if iteration has not failed with an error.
        Warnings() annotations.Annotations
}

// ChunkSeries exposes a single time series and allows iterating over chunks.
// It combines Labels (the series identity) with ChunkIterable (its data).
type ChunkSeries interface {
        Labels
        ChunkIterable
}

// Labels represents an item that has labels, e.g. a time series.
type Labels interface {
        // Labels returns the complete set of labels. For series it means all labels identifying the series.
        Labels() labels.Labels
}

// SampleIterable represents an item whose samples can be iterated over.
type SampleIterable interface {
        // Iterator returns an iterator of the data of the series.
        // The iterator passed as argument is for re-use, if not nil.
        // Depending on implementation, the iterator can
        // be re-used or a new iterator can be allocated.
        Iterator(chunkenc.Iterator) chunkenc.Iterator
}

// ChunkIterable represents an item whose chunks can be iterated over.
type ChunkIterable interface {
        // Iterator returns an iterator that iterates over potentially overlapping
        // chunks of the series, sorted by min time.
        // NOTE(review): by analogy with SampleIterable, the argument is presumably
        // an iterator offered for re-use; confirm against implementations.
        Iterator(chunks.Iterator) chunks.Iterator
}

// Copyright 2017 The Prometheus Authors
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package storage

import (
        "github.com/prometheus/prometheus/util/annotations"
)

// lazyGenericSeriesSet is a wrapped series set that is initialised on first call to Next().
// Until then Err, At and Warnings return nil.
type lazyGenericSeriesSet struct {
        // init builds the wrapped set and returns it together with the result
        // that the first Next() call should report.
        init func() (genericSeriesSet, bool)

        // set is the wrapped set; nil until init has produced a non-nil set.
        set genericSeriesSet
}

// Next initialises the wrapped set on the first call and reports the result
// returned by init; afterwards it delegates to the wrapped set's Next.
func (c *lazyGenericSeriesSet) Next() bool {
        if c.set == nil {
                set, hasNext := c.init()
                c.set = set
                return hasNext
        }
        return c.set.Next()
}

// Err returns the wrapped set's error, or nil if the set has not been
// initialised yet.
func (c *lazyGenericSeriesSet) Err() error {
        if c.set == nil {
                return nil
        }
        return c.set.Err()
}

// At returns the wrapped set's current element, or nil if the set has not
// been initialised yet.
func (c *lazyGenericSeriesSet) At() Labels {
        if c.set == nil {
                return nil
        }
        return c.set.At()
}

// Warnings returns the wrapped set's warnings, or nil if the set has not
// been initialised yet.
func (c *lazyGenericSeriesSet) Warnings() annotations.Annotations {
        if c.set == nil {
                return nil
        }
        return c.set.Warnings()
}

// warningsOnlySeriesSet is an empty series set that only carries warnings.
type warningsOnlySeriesSet annotations.Annotations

// Next always reports false because the set holds no series.
func (warningsOnlySeriesSet) Next() bool {
        return false
}

// Err always returns nil.
func (warningsOnlySeriesSet) Err() error {
        return nil
}

// At always returns nil.
func (warningsOnlySeriesSet) At() Labels {
        return nil
}

// Warnings returns the set itself converted back to annotations.
func (c warningsOnlySeriesSet) Warnings() annotations.Annotations {
        return annotations.Annotations(c)
}

// errorOnlySeriesSet is an empty series set that only carries an error.
type errorOnlySeriesSet struct {
        err error
}

// Next always reports false because the set holds no series.
func (errorOnlySeriesSet) Next() bool {
        return false
}

// At always returns nil.
func (errorOnlySeriesSet) At() Labels {
        return nil
}

// Err returns the wrapped error.
func (s errorOnlySeriesSet) Err() error {
        return s.err
}

// Warnings always returns nil.
func (errorOnlySeriesSet) Warnings() annotations.Annotations {
        return nil
}

// Copyright 2021 The Prometheus Authors
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package storage

import (
        "math"

        "github.com/prometheus/prometheus/model/histogram"
        "github.com/prometheus/prometheus/tsdb/chunkenc"
)

// MemoizedSeriesIterator wraps an iterator with a buffer to look back the previous element.
//
// This iterator regards integer histograms as float histograms; calls to Seek() will never return chunkenc.Histogram.
// This iterator deliberately does not implement chunkenc.Iterator.
type MemoizedSeriesIterator struct {
        // it is the wrapped iterator; delta is subtracted from the target
        // time in Seek to obtain the start of the look-back window.
        it    chunkenc.Iterator
        delta int64

        // lastTime is the timestamp of the current element (math.MinInt64
        // after Reset); valueType is the type of the current element.
        lastTime  int64
        valueType chunkenc.ValueType

        // Keep track of the previously returned value.
        // prevTime == math.MinInt64 means no previous element is buffered.
        prevTime           int64
        prevValue          float64
        prevFloatHistogram *histogram.FloatHistogram
}

// NewMemoizedEmptyIterator is like NewMemoizedIterator, but wraps the no-op
// iterator from chunkenc.NewNopIterator instead of a caller-provided one.
func NewMemoizedEmptyIterator(delta int64) *MemoizedSeriesIterator {
        empty := chunkenc.NewNopIterator()
        return NewMemoizedIterator(empty, delta)
}

// NewMemoizedIterator returns a new MemoizedSeriesIterator wrapping it.
// delta is the look-back distance used by Seek. The wrapper starts with no
// buffered previous element and is positioned by Reset.
func NewMemoizedIterator(it chunkenc.Iterator, delta int64) *MemoizedSeriesIterator {
        memoized := new(MemoizedSeriesIterator)
        memoized.delta = delta
        memoized.prevTime = math.MinInt64
        memoized.Reset(it)
        return memoized
}

// Reset re-initialises the wrapper so it can be reused with the provided
// iterator: the current and previous timestamps are cleared and the new
// iterator is advanced once to load its first element type.
func (b *MemoizedSeriesIterator) Reset(it chunkenc.Iterator) {
        b.lastTime, b.prevTime = math.MinInt64, math.MinInt64
        b.it = it
        b.valueType = it.Next()
}

// PeekPrev returns the previous element of the iterator: its timestamp, its
// float value and its float histogram. ok is false, and the other results are
// zero, when no previous element is buffered.
func (b *MemoizedSeriesIterator) PeekPrev() (t int64, v float64, fh *histogram.FloatHistogram, ok bool) {
        if b.prevTime != math.MinInt64 {
                return b.prevTime, b.prevValue, b.prevFloatHistogram, true
        }
        return 0, 0, nil, false
}

// Seek advances the iterator to the element at time t or greater and returns
// its value type, or chunkenc.ValNone if there is no such element. Integer
// histograms are reported as chunkenc.ValFloatHistogram.
func (b *MemoizedSeriesIterator) Seek(t int64) chunkenc.ValueType {
        // t0 is the start of the look-back window that ends at t.
        t0 := t - b.delta

        if b.valueType != chunkenc.ValNone && t0 > b.lastTime {
                // Reset the previously stored element because the seek advanced
                // more than the delta.
                b.prevTime = math.MinInt64

                b.valueType = b.it.Seek(t0)
                switch b.valueType {
                case chunkenc.ValNone:
                        return chunkenc.ValNone
                case chunkenc.ValHistogram:
                        // Integer histograms are exposed as float histograms.
                        b.valueType = chunkenc.ValFloatHistogram
                }
                b.lastTime = b.it.AtT()
        }
        // The current element already satisfies the seek.
        if b.lastTime >= t {
                return b.valueType
        }
        // Step forward one element at a time. Next records each element it
        // leaves behind, so the element before the target is available
        // through PeekPrev.
        for b.Next() != chunkenc.ValNone {
                if b.lastTime >= t {
                        return b.valueType
                }
        }

        return chunkenc.ValNone
}

// Next advances the iterator to the next element and returns its value type.
// Before advancing, the current element is stored as the previous element.
// Note that this does not check whether the element being buffered is
// within the time range of the current element and the duration of delta before.
func (b *MemoizedSeriesIterator) Next() chunkenc.ValueType {
        // Keep track of the previous element.
        switch b.valueType {
        case chunkenc.ValNone:
                // Already exhausted: do not touch the underlying iterator again.
                return chunkenc.ValNone
        case chunkenc.ValFloat:
                b.prevTime, b.prevValue = b.it.At()
                b.prevFloatHistogram = nil
        case chunkenc.ValHistogram, chunkenc.ValFloatHistogram:
                // Histograms of either kind are buffered as float histograms.
                b.prevValue = 0
                b.prevTime, b.prevFloatHistogram = b.it.AtFloatHistogram(nil)
        }

        b.valueType = b.it.Next()
        if b.valueType != chunkenc.ValNone {
                b.lastTime = b.it.AtT()
        }
        // Integer histograms are exposed as float histograms.
        if b.valueType == chunkenc.ValHistogram {
                b.valueType = chunkenc.ValFloatHistogram
        }
        return b.valueType
}

// At returns the timestamp and float value of the current element, as
// reported by the wrapped iterator.
func (b *MemoizedSeriesIterator) At() (int64, float64) {
        ts, val := b.it.At()
        return ts, val
}

// AtFloatHistogram returns the timestamp and float histogram of the current
// element, as reported by the wrapped iterator. No histogram is passed in for
// re-use.
func (b *MemoizedSeriesIterator) AtFloatHistogram() (int64, *histogram.FloatHistogram) {
        ts, fh := b.it.AtFloatHistogram(nil)
        return ts, fh
}

// Err returns the error reported by the wrapped iterator.
func (b *MemoizedSeriesIterator) Err() error {
        err := b.it.Err()
        return err
}

// Copyright 2020 The Prometheus Authors
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package storage

import (
        "bytes"
        "container/heap"
        "context"
        "fmt"
        "math"
        "slices"
        "sync"

        "github.com/prometheus/prometheus/model/histogram"
        "github.com/prometheus/prometheus/model/labels"
        "github.com/prometheus/prometheus/tsdb/chunkenc"
        "github.com/prometheus/prometheus/tsdb/chunks"
        tsdb_errors "github.com/prometheus/prometheus/tsdb/errors"
        "github.com/prometheus/prometheus/util/annotations"
)

// mergeGenericQuerier merges the results of several generic queriers into one.
type mergeGenericQuerier struct {
        // queriers are the underlying queriers whose results are merged.
        queriers []genericQuerier

        // mergeFn is used when we see series from different queriers Selects with the same labels.
        mergeFn genericSeriesMergeFunc

        // concurrentSelect makes Select query all queriers in separate goroutines.
        // TODO(bwplotka): Remove once remote queries are asynchronous. False by default.
        concurrentSelect bool
}

// NewMergeQuerier returns a new Querier that merges results of given primary and secondary queriers.
// See NewFanout commentary to learn more about primary vs secondary differences.
//
// With no queriers at all a noopQuerier is returned, and a single querier is
// returned as is. Otherwise nil and noop queriers are dropped and the rest is
// merged; Selects run concurrently whenever any secondaries were passed.
//
// In case of overlaps between the data given by primaries' and secondaries' Selects, merge function will be used.
func NewMergeQuerier(primaries, secondaries []Querier, mergeFn VerticalSeriesMergeFunc) Querier {
        numPrimaries, numSecondaries := len(primaries), len(secondaries)
        if numPrimaries+numSecondaries == 0 {
                return noopQuerier{}
        }
        if numPrimaries == 1 && numSecondaries == 0 {
                return primaries[0]
        }
        if numPrimaries == 0 && numSecondaries == 1 {
                return secondaries[0]
        }

        merged := make([]genericQuerier, 0, numPrimaries+numSecondaries)
        for _, primary := range primaries {
                if primary == nil {
                        continue
                }
                if _, isNoop := primary.(noopQuerier); isNoop {
                        continue
                }
                merged = append(merged, newGenericQuerierFrom(primary))
        }
        for _, secondary := range secondaries {
                if secondary == nil {
                        continue
                }
                if _, isNoop := secondary.(noopQuerier); isNoop {
                        continue
                }
                merged = append(merged, newSecondaryQuerierFrom(secondary))
        }

        adapter := &seriesMergerAdapter{VerticalSeriesMergeFunc: mergeFn}
        return &querierAdapter{&mergeGenericQuerier{
                queriers:         merged,
                mergeFn:          adapter.Merge,
                concurrentSelect: numSecondaries > 0,
        }}
}

// NewMergeChunkQuerier returns a new Chunk Querier that merges results of given primary and secondary chunk queriers.
// See NewFanout commentary to learn more about primary vs secondary differences.
//
// With no queriers at all a noopChunkQuerier is returned, and a single querier
// is returned as is. Otherwise nil and noop queriers are dropped and the rest
// is merged; Selects run concurrently whenever any secondaries were passed.
//
// In case of overlaps between the data given by primaries' and secondaries' Selects, merge function will be used.
// TODO(bwplotka): Currently merge will compact overlapping chunks with bigger chunk, without limit. Split it: https://github.com/prometheus/tsdb/issues/670
func NewMergeChunkQuerier(primaries, secondaries []ChunkQuerier, mergeFn VerticalChunkSeriesMergeFunc) ChunkQuerier {
        numPrimaries, numSecondaries := len(primaries), len(secondaries)
        if numPrimaries == 0 && numSecondaries == 0 {
                return noopChunkQuerier{}
        }
        if numPrimaries == 1 && numSecondaries == 0 {
                return primaries[0]
        }
        if numPrimaries == 0 && numSecondaries == 1 {
                return secondaries[0]
        }

        merged := make([]genericQuerier, 0, numPrimaries+numSecondaries)
        for _, primary := range primaries {
                if primary == nil {
                        continue
                }
                if _, isNoop := primary.(noopChunkQuerier); isNoop {
                        continue
                }
                merged = append(merged, newGenericQuerierFromChunk(primary))
        }
        for _, secondary := range secondaries {
                if secondary == nil {
                        continue
                }
                if _, isNoop := secondary.(noopChunkQuerier); isNoop {
                        continue
                }
                merged = append(merged, newSecondaryQuerierFromChunk(secondary))
        }

        adapter := &chunkSeriesMergerAdapter{VerticalChunkSeriesMergeFunc: mergeFn}
        return &chunkQuerierAdapter{&mergeGenericQuerier{
                queriers:         merged,
                mergeFn:          adapter.Merge,
                concurrentSelect: numSecondaries > 0,
        }}
}

// Select returns a set of series that matches the given label matchers.
//
// Every underlying querier is asked for sorted results regardless of
// sortSeries, because the merge needs sorted input. With concurrentSelect set,
// the queriers are queried in separate goroutines and Select waits for all of
// them. The merge itself is deferred until the first Next() on the returned set.
func (q *mergeGenericQuerier) Select(ctx context.Context, sortSeries bool, hints *SelectHints, matchers ...*labels.Matcher) genericSeriesSet {
        sets := make([]genericSeriesSet, 0, len(q.queriers))

        if q.concurrentSelect {
                var wg sync.WaitGroup
                results := make(chan genericSeriesSet)

                // Schedule all Selects for all queriers we know about.
                for _, querier := range q.queriers {
                        wg.Add(1)
                        go func(qr genericQuerier) {
                                defer wg.Done()

                                // We need to sort for NewMergeSeriesSet to work.
                                results <- qr.Select(ctx, true, hints, matchers...)
                        }(querier)
                }
                // Close the channel once every Select has delivered its result,
                // which ends the collecting loop below.
                go func() {
                        wg.Wait()
                        close(results)
                }()

                for set := range results {
                        sets = append(sets, set)
                }
        } else {
                for _, querier := range q.queriers {
                        // We need to sort for merge to work.
                        sets = append(sets, querier.Select(ctx, true, hints, matchers...))
                }
        }

        return &lazyGenericSeriesSet{init: func() (genericSeriesSet, bool) {
                merged := newGenericMergeSeriesSet(sets, q.mergeFn)
                return merged, merged.Next()
        }}
}

// labelGenericQueriers is a list of generic queriers that can be split in
// half, as used when merging label values.
type labelGenericQueriers []genericQuerier

// Len returns the number of queriers in the list.
func (l labelGenericQueriers) Len() int {
        return len(l)
}

// Get returns the i-th querier as a LabelQuerier.
func (l labelGenericQueriers) Get(i int) LabelQuerier {
        return l[i]
}

// SplitByHalf splits the list at its midpoint; for an odd length the second
// half gets the extra element. Both halves share the original backing array.
func (l labelGenericQueriers) SplitByHalf() (labelGenericQueriers, labelGenericQueriers) {
        mid := len(l) / 2
        left, right := l[:mid], l[mid:]
        return left, right
}

// LabelValues returns all potential values for a label name, merged across
// every underlying querier.
// If matchers are specified the returned result set is reduced
// to label values of metrics matching the matchers.
// On failure the error is wrapped with the label name and no values or
// warnings are returned.
func (q *mergeGenericQuerier) LabelValues(ctx context.Context, name string, matchers ...*labels.Matcher) ([]string, annotations.Annotations, error) {
        values, warnings, lookupErr := q.lvals(ctx, q.queriers, name, matchers...)
        if lookupErr != nil {
                wrapped := fmt.Errorf("LabelValues() from merge generic querier for label %s: %w", name, lookupErr)
                return nil, nil, wrapped
        }
        return values, warnings, nil
}

// lvals performs merge sort for LabelValues from multiple queriers.
//
// The querier list is split in half recursively; a single querier is asked
// directly and the two halves' results are combined with mergeStrings.
// Warnings from both halves are accumulated into a map owned by this call and
// are returned alongside any error.
func (q *mergeGenericQuerier) lvals(ctx context.Context, lq labelGenericQueriers, n string, matchers ...*labels.Matcher) ([]string, annotations.Annotations, error) {
        if lq.Len() == 0 {
                return nil, nil, nil
        }
        if lq.Len() == 1 {
                return lq.Get(0).LabelValues(ctx, n, matchers...)
        }
        a, b := lq.SplitByHalf()

        // ws is the only accumulator. Each half's warnings get their own
        // variable so that the accumulator is never overwritten and the map
        // returned by a sub-call is never mutated.
        var ws annotations.Annotations
        s1, w1, err := q.lvals(ctx, a, n, matchers...)
        ws.Merge(w1)
        if err != nil {
                return nil, ws, err
        }
        s2, w2, err := q.lvals(ctx, b, n, matchers...)
        ws.Merge(w2)
        if err != nil {
                return nil, ws, err
        }
        return mergeStrings(s1, s2), ws, nil
}

// mergeStrings merges a and b into a single slice in one forward pass,
// emitting a value only once when it sits at the head of both inputs.
// It assumes both inputs are already sorted; the output is then sorted too.
// The result is always a freshly allocated, non-nil slice.
func mergeStrings(a, b []string) []string {
        longest := len(b)
        if len(a) >= len(b) {
                longest = len(a)
        }
        merged := make([]string, 0, longest*10/9)

        i, j := 0, 0
        for i < len(a) && j < len(b) {
                x, y := a[i], b[j]
                switch {
                case x < y:
                        merged = append(merged, x)
                        i++
                case y < x:
                        merged = append(merged, y)
                        j++
                default:
                        // Same value on both sides: keep a single copy.
                        merged = append(merged, x)
                        i++
                        j++
                }
        }

        // At most one of the two tails is non-empty here.
        merged = append(merged, a[i:]...)
        return append(merged, b[j:]...)
}

// LabelNames returns all the unique label names present in all queriers in sorted order.
// Warnings from the individual queriers are merged; the first querier error
// aborts the call and is returned wrapped, without names or warnings.
func (q *mergeGenericQuerier) LabelNames(ctx context.Context, matchers ...*labels.Matcher) ([]string, annotations.Annotations, error) {
        var warnings annotations.Annotations
        seen := make(map[string]struct{})

        for _, querier := range q.queriers {
                names, wrn, err := querier.LabelNames(ctx, matchers...)
                if wrn != nil {
                        // TODO(bwplotka): We could potentially wrap warnings.
                        warnings.Merge(wrn)
                }
                if err != nil {
                        return nil, nil, fmt.Errorf("LabelNames() from merge generic querier: %w", err)
                }
                for _, name := range names {
                        seen[name] = struct{}{}
                }
        }

        if len(seen) == 0 {
                return nil, warnings, nil
        }

        // Flatten the set and sort it to give a deterministic result.
        sorted := make([]string, 0, len(seen))
        for name := range seen {
                sorted = append(sorted, name)
        }
        slices.Sort(sorted)
        return sorted, warnings, nil
}

// Close closes every underlying querier and returns the collected close
// errors, if any, as a single error.
func (q *mergeGenericQuerier) Close() error {
        errs := tsdb_errors.NewMulti()
        for _, querier := range q.queriers {
                err := querier.Close()
                if err == nil {
                        continue
                }
                errs.Add(err)
        }
        return errs.Err()
}

// VerticalSeriesMergeFunc is a function that merges several series with the
// same labels into a single Series.
// It has to handle time-overlapped series as well.
type VerticalSeriesMergeFunc func(...Series) Series

// NewMergeSeriesSet returns a new SeriesSet that merges many SeriesSets together.
// Each input set is wrapped in a generic adapter, and mergeFunc is used to
// combine series that the generic merge set reports as overlapping.
func NewMergeSeriesSet(sets []SeriesSet, mergeFunc VerticalSeriesMergeFunc) SeriesSet {
        genericSets := make([]genericSeriesSet, len(sets))
        for i, s := range sets {
                genericSets[i] = &genericSeriesSetAdapter{s}
        }
        merger := &seriesMergerAdapter{VerticalSeriesMergeFunc: mergeFunc}
        return &seriesSetAdapter{newGenericMergeSeriesSet(genericSets, merger.Merge)}
}

// VerticalChunkSeriesMergeFunc is a function that merges potentially
// time-overlapping chunk series with the same labels into a single ChunkSeries.
//
// NOTE: It's up to the implementation how series are vertically merged (if chunks are sorted, re-encoded etc).
type VerticalChunkSeriesMergeFunc func(...ChunkSeries) ChunkSeries

// NewMergeChunkSeriesSet returns a new ChunkSeriesSet that merges many ChunkSeriesSets together.
// Each input set is wrapped in a generic adapter, and mergeFunc is used to
// combine chunk series that the generic merge set reports as overlapping.
func NewMergeChunkSeriesSet(sets []ChunkSeriesSet, mergeFunc VerticalChunkSeriesMergeFunc) ChunkSeriesSet {
        genericSets := make([]genericSeriesSet, len(sets))
        for i, s := range sets {
                genericSets[i] = &genericChunkSeriesSetAdapter{s}
        }
        merger := &chunkSeriesMergerAdapter{VerticalChunkSeriesMergeFunc: mergeFunc}
        return &chunkSeriesSetAdapter{newGenericMergeSeriesSet(genericSets, merger.Merge)}
}

// genericMergeSeriesSet implements genericSeriesSet by merging several
// underlying series sets through a heap ordered by the labels of the series
// under each set's cursor.
type genericMergeSeriesSet struct {
        // currentLabels is the label set shared by all entries of currentSets;
        // it is updated by Next.
        currentLabels labels.Labels
        // mergeFunc combines the current series in At when more than one set
        // is positioned on the same label set.
        mergeFunc     genericSeriesMergeFunc

        // heap holds the sets that still have a series under their cursor and
        // are not part of currentSets.
        heap        genericSeriesSetHeap
        // sets are all input sets; Err and Warnings iterate over them.
        sets        []genericSeriesSet
        // currentSets are the sets popped from the heap by the last Next call.
        currentSets []genericSeriesSet
}

// newGenericMergeSeriesSet returns a new genericSeriesSet that merges (and deduplicates)
// series returned by the series sets when iterating.
// Each series set must return its series in labels order, otherwise
// merged series set will be incorrect.
// Overlapped situations are merged using provided mergeFunc.
//
// A single input set is returned as is. Nil entries in sets are ignored. If
// any set reports an error while being pre-advanced, a set that only carries
// that error is returned instead.
func newGenericMergeSeriesSet(sets []genericSeriesSet, mergeFunc genericSeriesMergeFunc) genericSeriesSet {
        if len(sets) == 1 {
                return sets[0]
        }

        // We are pre-advancing sets, so we can introspect the label of the
        // series under the cursor.
        var h genericSeriesSetHeap
        // Only non-nil sets are retained: Err and Warnings call methods on
        // every retained set and would panic on a nil interface value.
        live := make([]genericSeriesSet, 0, len(sets))
        for _, set := range sets {
                if set == nil {
                        continue
                }
                live = append(live, set)
                if set.Next() {
                        heap.Push(&h, set)
                }
                if err := set.Err(); err != nil {
                        return errorOnlySeriesSet{err}
                }
        }
        return &genericMergeSeriesSet{
                mergeFunc: mergeFunc,
                sets:      live,
                heap:      h,
        }
}

// Next advances the merged set to the next distinct label set. It returns
// false once no underlying set has a series left.
func (c *genericMergeSeriesSet) Next() bool {
        // Run in a loop because the "next" series sets may not be valid anymore.
        // If, for the current label set, all the next series sets come from
        // failed remote storage sources, we want to keep trying with the next label set.
        for {
                // Advance the sets consumed by the previous call. Exhausted sets
                // are dropped; the others go back into the heap.
                for _, advanced := range c.currentSets {
                        if advanced.Next() {
                                heap.Push(&c.heap, advanced)
                        }
                }

                if len(c.heap) == 0 {
                        return false
                }

                // Collect every set whose cursor sits on the smallest label set.
                c.currentSets = c.currentSets[:0]
                c.currentLabels = c.heap[0].At().Labels()
                for len(c.heap) > 0 && labels.Equal(c.currentLabels, c.heap[0].At().Labels()) {
                        popped := heap.Pop(&c.heap).(genericSeriesSet)
                        c.currentSets = append(c.currentSets, popped)
                }

                // At least one collected set means there is a series to expose.
                if len(c.currentSets) > 0 {
                        return true
                }
        }
}

// At returns the series for the current label set. When exactly one set is
// positioned on it, that set's series is returned directly; otherwise the
// series of all current sets are combined with mergeFunc.
func (c *genericMergeSeriesSet) At() Labels {
        if len(c.currentSets) == 1 {
                return c.currentSets[0].At()
        }
        overlapping := make([]Labels, len(c.currentSets))
        for i, set := range c.currentSets {
                overlapping[i] = set.At()
        }
        return c.mergeFunc(overlapping...)
}

// Err returns the first error reported by any of the retained sets, or nil
// when none of them reports one.
func (c *genericMergeSeriesSet) Err() error {
        for i := range c.sets {
                if setErr := c.sets[i].Err(); setErr != nil {
                        return setErr
                }
        }
        return nil
}

// Warnings returns the warnings of all retained sets merged together.
func (c *genericMergeSeriesSet) Warnings() annotations.Annotations {
        var merged annotations.Annotations
        for i := range c.sets {
                merged.Merge(c.sets[i].Warnings())
        }
        return merged
}

// genericSeriesSetHeap is a heap of series sets for use with container/heap,
// ordered by the labels of the series currently under each set's cursor.
type genericSeriesSetHeap []genericSeriesSet

// Len returns the number of sets in the heap.
func (h genericSeriesSetHeap) Len() int {
        return len(h)
}

// Swap exchanges the sets at positions i and j.
func (h genericSeriesSetHeap) Swap(i, j int) {
        h[j], h[i] = h[i], h[j]
}

// Less reports whether the current series of set i sorts before that of set j.
func (h genericSeriesSetHeap) Less(i, j int) bool {
        return labels.Compare(h[i].At().Labels(), h[j].At().Labels()) < 0
}

// Push appends x, which must be a genericSeriesSet, to the heap's backing slice.
func (h *genericSeriesSetHeap) Push(x interface{}) {
        *h = append(*h, x.(genericSeriesSet))
}

// Pop removes and returns the last element of the heap's backing slice.
func (h *genericSeriesSetHeap) Pop() interface{} {
        last := len(*h) - 1
        x := (*h)[last]
        *h = (*h)[:last]
        return x
}

// ChainedSeriesMerge returns a single series built from many identical, potentially overlapping series by
// chaining their samples together. If one or more samples overlap, one sample from the overlapping ones is
// kept and all others with the same timestamp are dropped.
//
// This works the best with replicated series, where data from two series are exactly the same. This does not work well
// with "almost" the same data, e.g. from 2 Prometheus HA replicas. This is fine, since from the Prometheus perspective
// this never happens.
//
// It's optimized for non-overlap cases as well.
//
// The labels of the result are taken from the first series. With no input
// series, nil is returned.
func ChainedSeriesMerge(series ...Series) Series {
        if len(series) == 0 {
                return nil
        }
        chained := func(it chunkenc.Iterator) chunkenc.Iterator {
                return ChainSampleIteratorFromSeries(it, series)
        }
        return &SeriesEntry{
                Lset:             series[0].Labels(),
                SampleIteratorFn: chained,
        }
}

// chainSampleIterator is responsible to iterate over samples from different iterators of the same time series in timestamps
// order. If one or more samples overlap, one sample from random overlapped ones is kept and all others with the same
// timestamp are dropped. It's optimized for non-overlap cases as well.
type chainSampleIterator struct {
        // iterators are all base iterators being chained.
        iterators []chunkenc.Iterator
        // h is a heap of base iterators ordered by the timestamp under their
        // cursor. It is nil until the first Next or Seek call.
        h         samplesIteratorHeap

        // curr is the base iterator the current sample is read from, or nil
        // when there is no current sample.
        curr  chunkenc.Iterator
        // lastT is the timestamp of the last returned sample.
        lastT int64

        // Whether the previous and the current sample are direct neighbors
        // within the same base iterator.
        consecutive bool
}

// getChainSampleIterator returns a chainSampleIterator whose iterators slice
// has the given length. If it already is a *chainSampleIterator it is reused,
// including its backing slice when that is large enough; otherwise a new one
// is allocated. The heap and the last timestamp are reset.
func getChainSampleIterator(it chunkenc.Iterator, length int) *chainSampleIterator {
        chain, reusable := it.(*chainSampleIterator)
        if !reusable {
                chain = &chainSampleIterator{}
        }
        if length <= cap(chain.iterators) {
                chain.iterators = chain.iterators[:length]
        } else {
                chain.iterators = make([]chunkenc.Iterator, length)
        }
        chain.h, chain.lastT = nil, math.MinInt64
        return chain
}

// ChainSampleIteratorFromSeries returns a chained sample iterator over the
// given series. It is reused when it is a chainSampleIterator, and its
// previous per-slot iterators are offered to each series for reuse.
func ChainSampleIteratorFromSeries(it chunkenc.Iterator, series []Series) chunkenc.Iterator {
        chain := getChainSampleIterator(it, len(series))
        for i := range series {
                chain.iterators[i] = series[i].Iterator(chain.iterators[i])
        }
        return chain
}

// ChainSampleIteratorFromIterables returns a chained sample iterator over the
// given iterables. It is reused when it is a chainSampleIterator, and its
// previous per-slot iterators are offered to each iterable for reuse.
func ChainSampleIteratorFromIterables(it chunkenc.Iterator, iterables []chunkenc.Iterable) chunkenc.Iterator {
        chain := getChainSampleIterator(it, len(iterables))
        for i := range iterables {
                chain.iterators[i] = iterables[i].Iterator(chain.iterators[i])
        }
        return chain
}

// ChainSampleIteratorFromIterators returns a chained sample iterator that
// uses the given iterators slice directly (it is not copied). It is reused
// when it is a chainSampleIterator.
func ChainSampleIteratorFromIterators(it chunkenc.Iterator, iterators []chunkenc.Iterator) chunkenc.Iterator {
        chain := getChainSampleIterator(it, 0)
        // Replace the (emptied) slice with the caller-provided one.
        chain.iterators = iterators
        return chain
}

// Seek positions the iterator on the first sample with a timestamp at or
// after t and returns its value type. It returns ValNone when no such sample
// exists or when any base iterator reports an error.
func (c *chainSampleIterator) Seek(t int64) chunkenc.ValueType {
        // No-op check: the current sample already satisfies the request.
        if c.curr != nil && c.lastT >= t {
                return c.curr.Seek(c.lastT)
        }

        // Don't bother to find out if the next sample is consecutive. Callers
        // of Seek usually aren't interested anyway.
        c.consecutive = false

        // Rebuild the heap from every base iterator that has a sample at or after t.
        c.h = samplesIteratorHeap{}
        for _, base := range c.iterators {
                if base.Seek(t) != chunkenc.ValNone {
                        heap.Push(&c.h, base)
                        continue
                }
                if base.Err() != nil {
                        // If any iterator is reporting an error, abort.
                        return chunkenc.ValNone
                }
        }

        if len(c.h) == 0 {
                c.curr = nil
                return chunkenc.ValNone
        }
        c.curr = heap.Pop(&c.h).(chunkenc.Iterator)
        c.lastT = c.curr.AtT()
        return c.curr.Seek(c.lastT)
}

// At returns the current float sample of the active base iterator.
// It panics when there is no active base iterator.
func (c *chainSampleIterator) At() (t int64, v float64) {
        if c.curr != nil {
                return c.curr.At()
        }
        panic("chainSampleIterator.At called before first .Next or after .Next returned false.")
}

// AtHistogram returns the current histogram sample of the active base
// iterator. For a non-gauge histogram that does not directly follow the
// previous sample within the same base iterator, the counter reset hint is
// set to unknown. It panics when there is no active base iterator.
func (c *chainSampleIterator) AtHistogram(h *histogram.Histogram) (int64, *histogram.Histogram) {
        if c.curr == nil {
                panic("chainSampleIterator.AtHistogram called before first .Next or after .Next returned false.")
        }
        ts, hist := c.curr.AtHistogram(h)
        if c.consecutive || hist.CounterResetHint == histogram.GaugeType {
                return ts, hist
        }
        // If the current sample is not consecutive with the previous one, we
        // cannot be sure anymore about counter resets for counter histograms.
        // TODO(beorn7): If a `NotCounterReset` sample is followed by a
        // non-consecutive `CounterReset` sample, we could keep the hint as
        // `CounterReset`. But then we needed to track the previous sample
        // in more detail, which might not be worth it.
        hist.CounterResetHint = histogram.UnknownCounterReset
        return ts, hist
}

// AtFloatHistogram returns the current float histogram sample of the active
// base iterator. For a non-gauge histogram that does not directly follow the
// previous sample within the same base iterator, the counter reset hint is
// set to unknown. It panics when there is no active base iterator.
func (c *chainSampleIterator) AtFloatHistogram(fh *histogram.FloatHistogram) (int64, *histogram.FloatHistogram) {
        if c.curr == nil {
                panic("chainSampleIterator.AtFloatHistogram called before first .Next or after .Next returned false.")
        }
        ts, hist := c.curr.AtFloatHistogram(fh)
        if c.consecutive || hist.CounterResetHint == histogram.GaugeType {
                return ts, hist
        }
        // If the current sample is not consecutive with the previous one, we
        // cannot be sure anymore about counter resets for counter histograms.
        // TODO(beorn7): If a `NotCounterReset` sample is followed by a
        // non-consecutive `CounterReset` sample, we could keep the hint as
        // `CounterReset`. But then we needed to track the previous sample
        // in more detail, which might not be worth it.
        hist.CounterResetHint = histogram.UnknownCounterReset
        return ts, hist
}

// AtT returns the timestamp of the current sample of the active base
// iterator. It panics when there is no active base iterator.
func (c *chainSampleIterator) AtT() int64 {
        if c.curr != nil {
                return c.curr.AtT()
        }
        panic("chainSampleIterator.AtT called before first .Next or after .Next returned false.")
}

// Next advances to the next sample in timestamp order across all base
// iterators and returns its value type. Samples whose timestamp equals the
// last returned timestamp are skipped. It returns ValNone when every base
// iterator is exhausted, when any base iterator reports an error, or when
// there are no base iterators at all.
func (c *chainSampleIterator) Next() chunkenc.ValueType {
        var (
                currT           int64
                currValueType   chunkenc.ValueType
                iteratorChanged bool
        )
        if c.h == nil {
                if len(c.iterators) == 0 {
                        // Nothing to chain. Report exhaustion instead of indexing
                        // into an empty slice, matching what Seek does in this case.
                        c.curr = nil
                        return chunkenc.ValNone
                }
                iteratorChanged = true
                c.h = samplesIteratorHeap{}
                // We call c.curr.Next() as the first thing below.
                // So, we don't call Next() on it here.
                c.curr = c.iterators[0]
                for _, iter := range c.iterators[1:] {
                        if iter.Next() == chunkenc.ValNone {
                                if iter.Err() != nil {
                                        // If any iterator is reporting an error, abort.
                                        // If c.iterators[0] is reporting an error, we'll handle that below.
                                        return chunkenc.ValNone
                                }
                        } else {
                                heap.Push(&c.h, iter)
                        }
                }
        }

        if c.curr == nil {
                return chunkenc.ValNone
        }

        for {
                currValueType = c.curr.Next()

                if currValueType == chunkenc.ValNone {
                        if c.curr.Err() != nil {
                                // Abort if we've hit an error.
                                return chunkenc.ValNone
                        }

                        if len(c.h) == 0 {
                                // No iterator left to iterate.
                                c.curr = nil
                                return chunkenc.ValNone
                        }
                } else {
                        currT = c.curr.AtT()
                        if currT == c.lastT {
                                // Ignoring sample for the same timestamp.
                                continue
                        }
                        if len(c.h) == 0 {
                                // curr is the only iterator remaining,
                                // no need to check with the heap.
                                break
                        }

                        // Check current iterator with the top of the heap.
                        nextT := c.h[0].AtT()
                        if currT < nextT {
                                // Current iterator has smaller timestamp than the heap.
                                break
                        }
                        // Current iterator does not hold the smallest timestamp.
                        heap.Push(&c.h, c.curr)
                }

                // Switch to the base iterator with the smallest timestamp.
                c.curr = heap.Pop(&c.h).(chunkenc.Iterator)
                iteratorChanged = true
                currT = c.curr.AtT()
                // Seeking to the iterator's own timestamp yields the value type
                // of the sample it is already positioned on.
                currValueType = c.curr.Seek(currT)
                if currT != c.lastT {
                        break
                }
        }

        c.consecutive = !iteratorChanged
        c.lastT = currT
        return currValueType
}

// Err returns the errors of all base iterators combined into a single error.
func (c *chainSampleIterator) Err() error {
        combined := tsdb_errors.NewMulti()
        for i := range c.iterators {
                combined.Add(c.iterators[i].Err())
        }
        return combined.Err()
}

// samplesIteratorHeap is a heap of sample iterators for use with
// container/heap, ordered by the timestamp under each iterator's cursor.
type samplesIteratorHeap []chunkenc.Iterator

// Len returns the number of iterators in the heap.
func (h samplesIteratorHeap) Len() int {
        return len(h)
}

// Swap exchanges the iterators at positions i and j.
func (h samplesIteratorHeap) Swap(i, j int) {
        h[j], h[i] = h[i], h[j]
}

// Less reports whether iterator i is positioned on an earlier timestamp than iterator j.
func (h samplesIteratorHeap) Less(i, j int) bool {
        ti, tj := h[i].AtT(), h[j].AtT()
        return ti < tj
}

// Push appends x, which must be a chunkenc.Iterator, to the heap's backing slice.
func (h *samplesIteratorHeap) Push(x interface{}) {
        *h = append(*h, x.(chunkenc.Iterator))
}

// Pop removes and returns the last element of the heap's backing slice.
func (h *samplesIteratorHeap) Pop() interface{} {
        last := len(*h) - 1
        x := (*h)[last]
        *h = (*h)[:last]
        return x
}

// NewCompactingChunkSeriesMerger returns VerticalChunkSeriesMergeFunc that merges the same chunk series into single chunk series.
// In case of the chunk overlaps, it compacts those into one or more time-ordered non-overlapping chunks with merged data.
// Samples from overlapped chunks are merged using series vertical merge func.
// It expects the same labels for each given series; the labels of the result
// are taken from the first one. With no input series, the returned function
// yields nil.
//
// NOTE: Use the returned merge function only when you see potentially overlapping series, as this introduces a small overhead
// to handle overlaps between series.
func NewCompactingChunkSeriesMerger(mergeFunc VerticalSeriesMergeFunc) VerticalChunkSeriesMergeFunc {
        return func(series ...ChunkSeries) ChunkSeries {
                if len(series) == 0 {
                        return nil
                }
                // A fresh chunk iterator is requested from every series on each
                // call; the iterator passed in for reuse is ignored.
                newIterator := func(chunks.Iterator) chunks.Iterator {
                        perSeries := make([]chunks.Iterator, len(series))
                        for i, s := range series {
                                perSeries[i] = s.Iterator(nil)
                        }
                        return &compactChunkIterator{
                                mergeFunc: mergeFunc,
                                iterators: perSeries,
                        }
                }
                return &ChunkSeriesEntry{
                        Lset:            series[0].Labels(),
                        ChunkIteratorFn: newIterator,
                }
        }
}

// compactChunkIterator is responsible to compact chunks from different iterators of the same time series into single chainSeries.
// If time-overlapping chunks are found, they are encoded and passed to series merge and encoded again into one bigger chunk.
// TODO(bwplotka): Currently merge will compact overlapping chunks with bigger chunk, without limit. Split it: https://github.com/prometheus/tsdb/issues/670
type compactChunkIterator struct {
        // mergeFunc merges the decoded series of overlapping chunks.
        mergeFunc VerticalSeriesMergeFunc
        // iterators are the input chunk iterators, one per merged series.
        iterators []chunks.Iterator

        // h is a heap of chunk iterators ordered by the min time, then max
        // time, of the chunk under their cursor. It is filled lazily by Next.
        h chunkIteratorHeap

        // err holds an error raised while re-encoding merged chunks.
        err  error
        // curr is the chunk returned by At.
        curr chunks.Meta
}

// At returns the chunk selected by the last call to Next.
func (c *compactChunkIterator) At() chunks.Meta {
        return c.curr
}

// Next advances to the next output chunk and reports whether one is
// available. A chunk that overlaps no other chunk is passed through
// unchanged. Chunks that overlap in time are decoded, merged with mergeFunc
// and re-encoded; byte-identical duplicates are dropped without merging.
// It returns false when all inputs are exhausted or when re-encoding fails,
// in which case the error is recorded for Err.
func (c *compactChunkIterator) Next() bool {
        // While the heap is still nil, fill it with every input iterator that
        // has at least one chunk.
        if c.h == nil {
                for _, iter := range c.iterators {
                        if iter.Next() {
                                heap.Push(&c.h, iter)
                        }
                }
        }
        if len(c.h) == 0 {
                return false
        }

        // Take the earliest chunk as the candidate and put its iterator back
        // if it has more chunks.
        iter := heap.Pop(&c.h).(chunks.Iterator)
        c.curr = iter.At()
        if iter.Next() {
                heap.Push(&c.h, iter)
        }

        var (
                // overlapping collects the decoded chunks that overlap the candidate.
                overlapping []Series
                // oMaxTime is the largest max time seen in the overlapping group so far.
                oMaxTime    = c.curr.MaxTime
                // prev is the last chunk kept, used for duplicate detection.
                prev        = c.curr
        )
        // Detect overlaps to compact. Be smart about it and deduplicate on the fly if chunks are identical.
        for len(c.h) > 0 {
                // Get the next oldest chunk by min, then max time.
                next := c.h[0].At()
                if next.MinTime > oMaxTime {
                        // No overlap with current one.
                        break
                }

                // Only do something if it is not a perfect duplicate.
                if next.MinTime != prev.MinTime ||
                        next.MaxTime != prev.MaxTime ||
                        !bytes.Equal(next.Chunk.Bytes(), prev.Chunk.Bytes()) {
                        // We operate on same series, so labels do not matter here.
                        overlapping = append(overlapping, newChunkToSeriesDecoder(labels.EmptyLabels(), next))
                        if next.MaxTime > oMaxTime {
                                oMaxTime = next.MaxTime
                        }
                        prev = next
                }

                // Consume the inspected chunk and re-queue its iterator if it has more.
                iter := heap.Pop(&c.h).(chunks.Iterator)
                if iter.Next() {
                        heap.Push(&c.h, iter)
                }
        }
        if len(overlapping) == 0 {
                // Nothing overlapped (or only exact duplicates were found): the
                // candidate is returned as is.
                return true
        }

        // Add last as it's not yet included in overlap. We operate on same series, so labels does not matter here.
        iter = NewSeriesToChunkEncoder(c.mergeFunc(append(overlapping, newChunkToSeriesDecoder(labels.EmptyLabels(), c.curr))...)).Iterator(nil)
        if !iter.Next() {
                if c.err = iter.Err(); c.err != nil {
                        return false
                }
                panic("unexpected seriesToChunkEncoder lack of iterations")
        }
        c.curr = iter.At()
        // If the merge produced more than one chunk, queue the remainder so
        // later calls return it in heap order.
        if iter.Next() {
                heap.Push(&c.h, iter)
        }
        return true
}

// Err returns the errors of all input iterators, followed by the iterator's
// own recorded error, combined into a single error.
func (c *compactChunkIterator) Err() error {
        combined := tsdb_errors.NewMulti()
        for i := range c.iterators {
                combined.Add(c.iterators[i].Err())
        }
        combined.Add(c.err)
        return combined.Err()
}

// chunkIteratorHeap is a heap of chunk iterators for use with container/heap,
// ordered by the min time and then the max time of the chunk under each
// iterator's cursor.
type chunkIteratorHeap []chunks.Iterator

// Len returns the number of iterators in the heap.
func (h chunkIteratorHeap) Len() int {
        return len(h)
}

// Swap exchanges the iterators at positions i and j.
func (h chunkIteratorHeap) Swap(i, j int) {
        h[j], h[i] = h[i], h[j]
}

// Less orders by the current chunk's min time; ties are broken by max time.
func (h chunkIteratorHeap) Less(i, j int) bool {
        left, right := h[i].At(), h[j].At()
        if left.MinTime != right.MinTime {
                return left.MinTime < right.MinTime
        }
        return left.MaxTime < right.MaxTime
}

// Push appends x, which must be a chunks.Iterator, to the heap's backing slice.
func (h *chunkIteratorHeap) Push(x interface{}) {
        *h = append(*h, x.(chunks.Iterator))
}

// Pop removes and returns the last element of the heap's backing slice.
func (h *chunkIteratorHeap) Pop() interface{} {
        last := len(*h) - 1
        x := (*h)[last]
        *h = (*h)[:last]
        return x
}

// NewConcatenatingChunkSeriesMerger returns a VerticalChunkSeriesMergeFunc that simply concatenates the
// chunks from the series. The resultant stream of chunks for a series might be overlapping and unsorted.
// The labels of the result are taken from the first series. With no input
// series, the returned function yields nil.
func NewConcatenatingChunkSeriesMerger() VerticalChunkSeriesMergeFunc {
        return func(series ...ChunkSeries) ChunkSeries {
                if len(series) == 0 {
                        return nil
                }
                // A fresh chunk iterator is requested from every series on each
                // call; the iterator passed in for reuse is ignored.
                newIterator := func(chunks.Iterator) chunks.Iterator {
                        perSeries := make([]chunks.Iterator, len(series))
                        for i, s := range series {
                                perSeries[i] = s.Iterator(nil)
                        }
                        return &concatenatingChunkIterator{iterators: perSeries}
                }
                return &ChunkSeriesEntry{
                        Lset:            series[0].Labels(),
                        ChunkIteratorFn: newIterator,
                }
        }
}

// concatenatingChunkIterator yields the chunks of several chunk iterators one
// iterator after the other, in the order the iterators are listed.
type concatenatingChunkIterator struct {
        // iterators are the input chunk iterators.
        iterators []chunks.Iterator
        // idx is the index of the iterator currently being drained.
        idx       int

        // curr is the chunk returned by At.
        curr chunks.Meta
}

// At returns the chunk selected by the last successful call to Next.
func (c *concatenatingChunkIterator) At() chunks.Meta {
        return c.curr
}

// Next advances to the next chunk, moving on to the following input iterator
// whenever the current one is exhausted without an error. It returns false
// when all iterators are exhausted or when the current one reports an error.
func (c *concatenatingChunkIterator) Next() bool {
        for c.idx < len(c.iterators) {
                active := c.iterators[c.idx]
                if active.Next() {
                        c.curr = active.At()
                        return true
                }
                if active.Err() != nil {
                        // Stay on the failed iterator so later calls keep failing.
                        return false
                }
                c.idx++
        }
        return false
}

// Err returns the errors of all input iterators combined into a single error.
func (c *concatenatingChunkIterator) Err() error {
        combined := tsdb_errors.NewMulti()
        for i := range c.iterators {
                combined.Add(c.iterators[i].Err())
        }
        return combined.Err()
}

// Copyright 2017 The Prometheus Authors
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package storage

import (
        "context"

        "github.com/prometheus/prometheus/model/labels"
        "github.com/prometheus/prometheus/util/annotations"
)

// noopQuerier is a stateless querier whose methods return empty results.
type noopQuerier struct{}

// NoopQuerier is a Querier that does nothing.
func NoopQuerier() Querier {
        return noopQuerier{}
}

// Select ignores its arguments and returns an empty series set.
func (noopQuerier) Select(context.Context, bool, *SelectHints, ...*labels.Matcher) SeriesSet {
        return NoopSeriesSet()
}

// LabelValues ignores its arguments and returns no values, warnings or error.
func (noopQuerier) LabelValues(context.Context, string, ...*labels.Matcher) ([]string, annotations.Annotations, error) {
        return nil, nil, nil
}

// LabelNames ignores its arguments and returns no names, warnings or error.
func (noopQuerier) LabelNames(context.Context, ...*labels.Matcher) ([]string, annotations.Annotations, error) {
        return nil, nil, nil
}

// Close does nothing and always returns nil.
func (noopQuerier) Close() error {
        return nil
}

// noopChunkQuerier is a stateless chunk querier whose methods return empty results.
type noopChunkQuerier struct{}

// NoopChunkedQuerier is a ChunkQuerier that does nothing.
func NoopChunkedQuerier() ChunkQuerier {
        return noopChunkQuerier{}
}

// Select ignores its arguments and returns an empty chunk series set.
func (noopChunkQuerier) Select(context.Context, bool, *SelectHints, ...*labels.Matcher) ChunkSeriesSet {
        return NoopChunkedSeriesSet()
}

// LabelValues ignores its arguments and returns no values, warnings or error.
func (noopChunkQuerier) LabelValues(context.Context, string, ...*labels.Matcher) ([]string, annotations.Annotations, error) {
        return nil, nil, nil
}

// LabelNames ignores its arguments and returns no names, warnings or error.
func (noopChunkQuerier) LabelNames(context.Context, ...*labels.Matcher) ([]string, annotations.Annotations, error) {
        return nil, nil, nil
}

// Close does nothing and always returns nil.
func (noopChunkQuerier) Close() error {
        return nil
}

// noopSeriesSet is a stateless series set that never yields a series.
type noopSeriesSet struct{}

// NoopSeriesSet is a SeriesSet that does nothing.
func NoopSeriesSet() SeriesSet {
        return noopSeriesSet{}
}

// Next always reports that there is no further series.
func (noopSeriesSet) Next() bool { return false }

// At always returns nil.
func (noopSeriesSet) At() Series { return nil }

// Err always returns nil.
func (noopSeriesSet) Err() error { return nil }

// Warnings always returns nil.
func (noopSeriesSet) Warnings() annotations.Annotations { return nil }

// noopChunkedSeriesSet is a stateless chunk series set that never yields a series.
type noopChunkedSeriesSet struct{}

// NoopChunkedSeriesSet is a ChunkSeriesSet that does nothing.
func NoopChunkedSeriesSet() ChunkSeriesSet {
        return noopChunkedSeriesSet{}
}

// Next always reports that there is no further chunk series.
func (noopChunkedSeriesSet) Next() bool { return false }

// At always returns nil.
func (noopChunkedSeriesSet) At() ChunkSeries { return nil }

// Err always returns nil.
func (noopChunkedSeriesSet) Err() error { return nil }

// Warnings always returns nil.
func (noopChunkedSeriesSet) Warnings() annotations.Annotations { return nil }

// Copyright 2023 The Prometheus Authors
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package azuread

import (
        "context"
        "errors"
        "fmt"
        "net/http"
        "strings"
        "sync"
        "time"

        "github.com/grafana/regexp"

        "github.com/Azure/azure-sdk-for-go/sdk/azcore"
        "github.com/Azure/azure-sdk-for-go/sdk/azcore/cloud"
        "github.com/Azure/azure-sdk-for-go/sdk/azcore/policy"
        "github.com/Azure/azure-sdk-for-go/sdk/azidentity"
        "github.com/google/uuid"
)

const (
        // Clouds: names of the Azure clouds (compare the examples listed on
        // the Cloud field of AzureADConfig).
        AzureChina      = "AzureChina"
        AzureGovernment = "AzureGovernment"
        AzurePublic     = "AzurePublic"

        // Audiences: one audience string per cloud listed above.
        // NOTE(review): the double slash before ".default" looks intentional — confirm before changing.
        IngestionChinaAudience      = "https://monitor.azure.cn//.default"
        IngestionGovernmentAudience = "https://monitor.azure.us//.default"
        IngestionPublicAudience     = "https://monitor.azure.com//.default"
)

// ManagedIdentityConfig is used to store managed identity config values.
// It is read from the "managed_identity" section of AzureADConfig.
type ManagedIdentityConfig struct {
        // ClientID is the clientId of the managed identity that is being used to authenticate.
        // YAML key: client_id.
        ClientID string `yaml:"client_id,omitempty"`
}

// OAuthConfig is used to store azure oauth config values.
// It is read from the "oauth" section of AzureADConfig.
type OAuthConfig struct {
        // ClientID is the clientId of the azure active directory application that is being used to authenticate.
        // YAML key: client_id.
        ClientID string `yaml:"client_id,omitempty"`

        // ClientSecret is the clientSecret of the azure active directory application that is being used to authenticate.
        // YAML key: client_secret.
        ClientSecret string `yaml:"client_secret,omitempty"`

        // TenantID is the tenantId of the azure active directory application that is being used to authenticate.
        // YAML key: tenant_id.
        TenantID string `yaml:"tenant_id,omitempty"`
}

// SDKConfig is used to store Azure SDK config values.
type SDKConfig struct {
        // TenantID is the tenant ID of the Azure Active Directory application that is being used to authenticate.
        // It is optional: AzureADConfig.Validate only inspects it when it is non-empty.
        TenantID string `yaml:"tenant_id,omitempty"`
}

// AzureADConfig is used to store the config values.
// Validate requires exactly one of ManagedIdentity, OAuth and SDK to be set.
type AzureADConfig struct { //nolint:revive // exported.
        // ManagedIdentity is the managed identity that is being used to authenticate.
        ManagedIdentity *ManagedIdentityConfig `yaml:"managed_identity,omitempty"`

        // OAuth is the oauth config that is being used to authenticate.
        OAuth *OAuthConfig `yaml:"oauth,omitempty"`

        // SDK is the SDK config that is being used to authenticate.
        SDK *SDKConfig `yaml:"sdk,omitempty"`

        // Cloud is the Azure cloud in which the service is running. Example: AzurePublic/AzureGovernment/AzureChina.
        // Validate defaults an empty value to AzurePublic.
        Cloud string `yaml:"cloud,omitempty"`
}

// azureADRoundTripper is an http.RoundTripper that sets a bearer Authorization
// header, obtained from tokenProvider, before delegating to next.
type azureADRoundTripper struct {
        // next is the round tripper the request is handed to once the header is set.
        next http.RoundTripper
        // tokenProvider supplies the access token used in the Authorization header.
        tokenProvider *tokenProvider
}

// tokenProvider is used to store and retrieve Azure AD accessToken.
type tokenProvider struct {
        // token is the member used to store the current valid accessToken.
        token string
        // mu guards access to token. getAccessToken holds it for its whole duration,
        // including while a new token is being fetched.
        mu sync.Mutex
        // refreshTime is used to store the refresh time of the current valid accessToken.
        // Once it has passed, valid reports false and a new token is requested.
        refreshTime time.Time
        // credentialClient is the Azure AD credential client that is being used to retrieve accessToken.
        credentialClient azcore.TokenCredential
        // options are the token request options (including the scopes) passed to credentialClient.GetToken.
        options *policy.TokenRequestOptions
}

// Validate validates config values provided.
func (c *AzureADConfig) Validate() error {
        if c.Cloud == "" {
                c.Cloud = AzurePublic
        }

        if c.Cloud != AzureChina && c.Cloud != AzureGovernment && c.Cloud != AzurePublic {
                return fmt.Errorf("must provide a cloud in the Azure AD config")
        }

        if c.ManagedIdentity == nil && c.OAuth == nil && c.SDK == nil {
                return fmt.Errorf("must provide an Azure Managed Identity, Azure OAuth or Azure SDK in the Azure AD config")
        }

        if c.ManagedIdentity != nil && c.OAuth != nil {
                return fmt.Errorf("cannot provide both Azure Managed Identity and Azure OAuth in the Azure AD config")
        }

        if c.ManagedIdentity != nil && c.SDK != nil {
                return fmt.Errorf("cannot provide both Azure Managed Identity and Azure SDK in the Azure AD config")
        }

        if c.OAuth != nil && c.SDK != nil {
                return fmt.Errorf("cannot provide both Azure OAuth and Azure SDK in the Azure AD config")
        }

        if c.ManagedIdentity != nil {
                if c.ManagedIdentity.ClientID == "" {
                        return fmt.Errorf("must provide an Azure Managed Identity client_id in the Azure AD config")
                }

                _, err := uuid.Parse(c.ManagedIdentity.ClientID)
                if err != nil {
                        return fmt.Errorf("the provided Azure Managed Identity client_id is invalid")
                }
        }

        if c.OAuth != nil {
                if c.OAuth.ClientID == "" {
                        return fmt.Errorf("must provide an Azure OAuth client_id in the Azure AD config")
                }
                if c.OAuth.ClientSecret == "" {
                        return fmt.Errorf("must provide an Azure OAuth client_secret in the Azure AD config")
                }
                if c.OAuth.TenantID == "" {
                        return fmt.Errorf("must provide an Azure OAuth tenant_id in the Azure AD config")
                }

                var err error
                _, err = uuid.Parse(c.OAuth.ClientID)
                if err != nil {
                        return fmt.Errorf("the provided Azure OAuth client_id is invalid")
                }
                _, err = regexp.MatchString("^[0-9a-zA-Z-.]+$", c.OAuth.TenantID)
                if err != nil {
                        return fmt.Errorf("the provided Azure OAuth tenant_id is invalid")
                }
        }

        if c.SDK != nil {
                var err error

                if c.SDK.TenantID != "" {
                        _, err = regexp.MatchString("^[0-9a-zA-Z-.]+$", c.SDK.TenantID)
                        if err != nil {
                                return fmt.Errorf("the provided Azure OAuth tenant_id is invalid")
                        }
                }
        }

        return nil
}

// UnmarshalYAML decodes the Azure AD config from YAML and validates the result.
// The receiver is reset to its zero value before decoding.
func (c *AzureADConfig) UnmarshalYAML(unmarshal func(interface{}) error) error {
        // plain has the same fields as AzureADConfig but none of its methods,
        // so decoding into it does not recurse into UnmarshalYAML.
        type plain AzureADConfig

        *c = AzureADConfig{}
        err := unmarshal((*plain)(c))
        if err != nil {
                return err
        }
        return c.Validate()
}

// NewAzureADRoundTripper builds a round tripper that adds Azure AD authorization
// to every call before handing it to next. A nil next falls back to
// http.DefaultTransport. An error is returned when the credential or the token
// provider cannot be created from cfg.
func NewAzureADRoundTripper(cfg *AzureADConfig, next http.RoundTripper) (http.RoundTripper, error) {
        cred, err := newTokenCredential(cfg)
        if err != nil {
                return nil, err
        }

        provider, err := newTokenProvider(cfg, cred)
        if err != nil {
                return nil, err
        }

        if next == nil {
                next = http.DefaultTransport
        }
        return &azureADRoundTripper{next: next, tokenProvider: provider}, nil
}

// RoundTrip obtains an access token, sets it as a bearer Authorization header
// and forwards the request to the next round tripper.
//
// To honour the http.RoundTripper contract, the caller's request is never
// modified: the header is set on a clone. If no token can be obtained, the
// request body is closed and the token error is returned without calling next.
func (rt *azureADRoundTripper) RoundTrip(req *http.Request) (*http.Response, error) {
        accessToken, err := rt.tokenProvider.getAccessToken(req.Context())
        if err != nil {
                // RoundTrip must always close the body, including on errors.
                if req.Body != nil {
                        req.Body.Close()
                }
                return nil, err
        }

        // Clone deep-copies the headers, so the original request stays untouched.
        authReq := req.Clone(req.Context())
        if authReq.Header == nil {
                authReq.Header = make(http.Header)
        }
        authReq.Header.Set("Authorization", "Bearer "+accessToken)

        return rt.next.RoundTrip(authReq)
}

// newTokenCredential returns a TokenCredential for the authentication method
// configured in cfg: Azure Managed Identity, Azure OAuth (AD application) or
// the Azure SDK default credential chain.
//
// It returns an error when the cloud is unknown, when the selected credential
// cannot be constructed, or when cfg configures no authentication method at
// all (previously this case yielded a nil credential and a nil error).
// If several methods are set, the last one checked (SDK, then OAuth) wins,
// although every configured method is still constructed and may fail.
func newTokenCredential(cfg *AzureADConfig) (azcore.TokenCredential, error) {
        cloudConfiguration, err := getCloudConfiguration(cfg.Cloud)
        if err != nil {
                return nil, err
        }
        clientOpts := &azcore.ClientOptions{
                Cloud: cloudConfiguration,
        }

        var cred azcore.TokenCredential

        if cfg.ManagedIdentity != nil {
                cred, err = newManagedIdentityTokenCredential(clientOpts, cfg.ManagedIdentity)
                if err != nil {
                        return nil, err
                }
        }

        if cfg.OAuth != nil {
                cred, err = newOAuthTokenCredential(clientOpts, cfg.OAuth)
                if err != nil {
                        return nil, err
                }
        }

        if cfg.SDK != nil {
                cred, err = newSDKTokenCredential(clientOpts, cfg.SDK)
                if err != nil {
                        return nil, err
                }
        }

        // Guard against configs that bypassed Validate: a nil credential would
        // otherwise only surface as a panic on the first token request.
        if cred == nil {
                return nil, errors.New("must provide an Azure Managed Identity, Azure OAuth or Azure SDK in the Azure AD config")
        }

        return cred, nil
}

// newManagedIdentityTokenCredential creates a Managed Identity token credential
// for the client ID in managedIdentityConfig, using the given client options.
func newManagedIdentityTokenCredential(clientOpts *azcore.ClientOptions, managedIdentityConfig *ManagedIdentityConfig) (azcore.TokenCredential, error) {
        return azidentity.NewManagedIdentityCredential(&azidentity.ManagedIdentityCredentialOptions{
                ClientOptions: *clientOpts,
                ID:            azidentity.ClientID(managedIdentityConfig.ClientID),
        })
}

// newOAuthTokenCredential creates a client-secret token credential from the
// tenant ID, client ID and client secret in oAuthConfig.
func newOAuthTokenCredential(clientOpts *azcore.ClientOptions, oAuthConfig *OAuthConfig) (azcore.TokenCredential, error) {
        return azidentity.NewClientSecretCredential(
                oAuthConfig.TenantID,
                oAuthConfig.ClientID,
                oAuthConfig.ClientSecret,
                &azidentity.ClientSecretCredentialOptions{ClientOptions: *clientOpts},
        )
}

// newSDKTokenCredential creates the Azure SDK default credential, passing on
// the tenant ID from sdkConfig and the given client options.
func newSDKTokenCredential(clientOpts *azcore.ClientOptions, sdkConfig *SDKConfig) (azcore.TokenCredential, error) {
        return azidentity.NewDefaultAzureCredential(&azidentity.DefaultAzureCredentialOptions{
                ClientOptions: *clientOpts,
                TenantID:      sdkConfig.TenantID,
        })
}

// newTokenProvider creates the provider that fetches and caches the accessToken
// for the given credential. The audience for cfg.Cloud is used as the only
// token request scope; an error is returned when the cloud is not recognised.
// The resulting accessToken is what gets attached to the Authorization header.
func newTokenProvider(cfg *AzureADConfig, cred azcore.TokenCredential) (*tokenProvider, error) {
        audience, err := getAudience(cfg.Cloud)
        if err != nil {
                return nil, err
        }

        opts := &policy.TokenRequestOptions{Scopes: []string{audience}}
        return &tokenProvider{credentialClient: cred, options: opts}, nil
}

// getAccessToken returns the current valid accessToken, fetching a new one
// first when the cached token is missing or past its refresh time.
//
// The provider's mutex is held for the whole call, so concurrent callers are
// serialised and at most one of them refreshes the token. On failure the
// underlying error is wrapped (and so remains inspectable with errors.Is /
// errors.As, e.g. for context cancellation) and an empty token is returned.
func (tokenProvider *tokenProvider) getAccessToken(ctx context.Context) (string, error) {
        tokenProvider.mu.Lock()
        defer tokenProvider.mu.Unlock()

        if tokenProvider.valid() {
                return tokenProvider.token, nil
        }
        if err := tokenProvider.getToken(ctx); err != nil {
                // Message text is kept unchanged; %w only adds the error chain.
                return "", fmt.Errorf("Failed to get access token: %w", err)
        }
        return tokenProvider.token, nil
}

// valid reports whether a token is stored and its refresh time is still in the future.
func (tokenProvider *tokenProvider) valid() bool {
        hasToken := len(tokenProvider.token) != 0
        return hasToken && tokenProvider.refreshTime.After(time.Now().UTC())
}

// getToken requests a fresh accessToken from the credential client, stores it
// in the tokenProvider and then updates the refresh time. An empty token or a
// failure in either step is reported as an error.
func (tokenProvider *tokenProvider) getToken(ctx context.Context) error {
        fetched, err := tokenProvider.credentialClient.GetToken(ctx, *tokenProvider.options)
        switch {
        case err != nil:
                return err
        case len(fetched.Token) == 0:
                return errors.New("access token is empty")
        }

        // The token is stored before the refresh time is computed.
        tokenProvider.token = fetched.Token
        return tokenProvider.updateRefreshTime(fetched)
}

// updateRefreshTime sets refreshTime to the point halfway between now and the
// token's expiry. If that point is not in the future, refreshTime is left
// untouched and an error is returned.
func (tokenProvider *tokenProvider) updateRefreshTime(accessToken azcore.AccessToken) error {
        expiry := accessToken.ExpiresOn.UTC()
        halfway := time.Now().Add(time.Until(expiry) / 2)
        if !halfway.After(time.Now().UTC()) {
                return errors.New("access token expiry is less than the current time")
        }
        tokenProvider.refreshTime = halfway
        return nil
}

// getAudience maps a cloud name (compared case-insensitively) to its ingestion
// audience. Unknown or empty cloud names produce an error.
func getAudience(cloud string) (string, error) {
        lowered := strings.ToLower(cloud)
        if lowered == strings.ToLower(AzureChina) {
                return IngestionChinaAudience, nil
        }
        if lowered == strings.ToLower(AzureGovernment) {
                return IngestionGovernmentAudience, nil
        }
        if lowered == strings.ToLower(AzurePublic) {
                return IngestionPublicAudience, nil
        }
        return "", errors.New("Cloud is not specified or is incorrect: " + cloud)
}

// getCloudConfiguration maps a cloud name (compared case-insensitively) to the
// SDK cloud configuration, which contains the AAD endpoint for that cloud.
// Unknown or empty cloud names produce an error and a zero configuration.
func getCloudConfiguration(c string) (cloud.Configuration, error) {
        lowered := strings.ToLower(c)
        if lowered == strings.ToLower(AzureChina) {
                return cloud.AzureChina, nil
        }
        if lowered == strings.ToLower(AzureGovernment) {
                return cloud.AzureGovernment, nil
        }
        if lowered == strings.ToLower(AzurePublic) {
                return cloud.AzurePublic, nil
        }
        return cloud.Configuration{}, errors.New("Cloud is not specified or is incorrect: " + c)
}

// Copyright 2017 The Prometheus Authors
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package storage

import (
        "context"
        "sync"

        "github.com/prometheus/prometheus/model/labels"
        "github.com/prometheus/prometheus/util/annotations"
)

// secondaryQuerier is a wrapper that allows a querier to be treated in a best effort manner.
// This means that an error on any method returned by Querier except Close will be returned as a warning,
// and the result will be empty.
//
// Additionally, Querier ensures that if ANY SeriesSet returned by this querier's Select failed on an initial Next,
// all other SeriesSets will return no response as well. This ensures a consistent partial response strategy, where you
// have either full results or none from each secondary Querier.
// NOTE: This works well only for implementations that only fail during the first Next() (e.g. fetch from network). If an
// implementation fails during further iterations, the set will panic. If Select is invoked after the first Next of any
// returned SeriesSet, the querier will panic.
//
// Not go-routine safe.
// NOTE: Prometheus treats all remote storages as secondary / best effort.
type secondaryQuerier struct {
        genericQuerier

        // once guards the one-time initial Next() pass over all asyncSets.
        once sync.Once
        // done is set after that initial pass; Select panics once it is true.
        done bool
        // asyncSets holds one underlying series set per Select call, in call order.
        asyncSets []genericSeriesSet
}

// newSecondaryQuerierFrom wraps q so that it is treated as a secondary
// (best effort) querier.
func newSecondaryQuerierFrom(q Querier) genericQuerier {
        wrapped := newGenericQuerierFrom(q)
        return &secondaryQuerier{genericQuerier: wrapped}
}

// newSecondaryQuerierFromChunk wraps cq so that it is treated as a secondary
// (best effort) querier.
func newSecondaryQuerierFromChunk(cq ChunkQuerier) genericQuerier {
        wrapped := newGenericQuerierFromChunk(cq)
        return &secondaryQuerier{genericQuerier: wrapped}
}

// LabelValues delegates to the wrapped querier. A failure is never returned as
// an error: it is added to the warnings and the values are dropped instead.
func (s *secondaryQuerier) LabelValues(ctx context.Context, name string, matchers ...*labels.Matcher) ([]string, annotations.Annotations, error) {
        vals, w, err := s.genericQuerier.LabelValues(ctx, name, matchers...)
        if err == nil {
                return vals, w, nil
        }
        return nil, w.Add(err), nil
}

// LabelNames delegates to the wrapped querier. A failure is never returned as
// an error: it is added to the warnings and the names are dropped instead.
func (s *secondaryQuerier) LabelNames(ctx context.Context, matchers ...*labels.Matcher) ([]string, annotations.Annotations, error) {
        names, w, err := s.genericQuerier.LabelNames(ctx, matchers...)
        if err == nil {
                return names, w, nil
        }
        return nil, w.Add(err), nil
}

// Select issues the select on the wrapped querier, records the resulting set in
// asyncSets and returns a lazy series set standing in for it.
//
// When any of the returned lazy sets is first initialised, Next() is called once
// on every recorded set (guarded by s.once). If one of them fails, the set being
// initialised is replaced by one carrying only the warnings plus that error, and
// every other set is replaced by a no-op set. Sets that are merely exhausted are
// replaced by a set carrying only their warnings.
//
// Select panics if it is called after that initial pass has run.
func (s *secondaryQuerier) Select(ctx context.Context, sortSeries bool, hints *SelectHints, matchers ...*labels.Matcher) genericSeriesSet {
        if s.done {
                panic("secondaryQuerier: Select invoked after first Next of any returned SeriesSet was done")
        }

        s.asyncSets = append(s.asyncSets, s.genericQuerier.Select(ctx, sortSeries, hints, matchers...))
        // curr is the index of the set belonging to this Select call.
        curr := len(s.asyncSets) - 1
        return &lazyGenericSeriesSet{init: func() (genericSeriesSet, bool) {
                s.once.Do(func() {
                        // At first init invocation we iterate over all async sets and ensure its Next() returns some value without
                        // errors. This is to ensure we support consistent partial failures.
                        for i, set := range s.asyncSets {
                                if set.Next() {
                                        continue
                                }
                                ws := set.Warnings()
                                if err := set.Err(); err != nil {
                                        // One of the sets failed, ensure current one returning errors as warnings, and rest of the sets return nothing.
                                        // (All or nothing logic).
                                        s.asyncSets[curr] = warningsOnlySeriesSet(ws.Add(err))
                                        // This inner i shadows the outer loop index; the outer loop is left via break right after.
                                        for i := range s.asyncSets {
                                                if curr == i {
                                                        continue
                                                }
                                                s.asyncSets[i] = noopGenericSeriesSet{}
                                        }
                                        break
                                }
                                // Exhausted set.
                                s.asyncSets[i] = warningsOnlySeriesSet(ws)
                        }
                        s.done = true
                })

                // The boolean is false for the replacement sets (warnings-only or no-op)
                // and true for a set whose initial Next() above succeeded.
                switch s.asyncSets[curr].(type) {
                case warningsOnlySeriesSet, noopGenericSeriesSet:
                        return s.asyncSets[curr], false
                default:
                        return s.asyncSets[curr], true
                }
        }}
}

// Copyright 2020 The Prometheus Authors
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package storage

import (
        "fmt"
        "math"
        "sort"

        "github.com/prometheus/prometheus/model/histogram"
        "github.com/prometheus/prometheus/model/labels"
        "github.com/prometheus/prometheus/tsdb/chunkenc"
        "github.com/prometheus/prometheus/tsdb/chunks"
)

// SeriesEntry is a series described by a label set and a function that
// produces its sample iterator.
type SeriesEntry struct {
        // Lset is the label set returned by Labels.
        Lset labels.Labels
        // SampleIteratorFn is invoked by Iterator with the iterator passed in by the caller.
        SampleIteratorFn func(chunkenc.Iterator) chunkenc.Iterator
}

// Labels returns Lset. Iterator returns the result of calling SampleIteratorFn with it.
func (s *SeriesEntry) Labels() labels.Labels                           { return s.Lset }
func (s *SeriesEntry) Iterator(it chunkenc.Iterator) chunkenc.Iterator { return s.SampleIteratorFn(it) }

// ChunkSeriesEntry is a chunk series described by a label set and a function
// that produces its chunk iterator.
type ChunkSeriesEntry struct {
        // Lset is the label set returned by Labels.
        Lset labels.Labels
        // ChunkIteratorFn is invoked by Iterator with the iterator passed in by the caller.
        ChunkIteratorFn func(chunks.Iterator) chunks.Iterator
}

// Labels returns Lset. Iterator returns the result of calling ChunkIteratorFn with it.
func (s *ChunkSeriesEntry) Labels() labels.Labels                       { return s.Lset }
func (s *ChunkSeriesEntry) Iterator(it chunks.Iterator) chunks.Iterator { return s.ChunkIteratorFn(it) }

// NewListSeries builds a series entry for lset whose iterator walks over the
// provided samples. When the iterator handed to Iterator is already a
// *listSeriesIterator, it is reset and reused instead of allocating a new one.
func NewListSeries(lset labels.Labels, s []chunks.Sample) *SeriesEntry {
        var list Samples = samples(s)

        iterFn := func(it chunkenc.Iterator) chunkenc.Iterator {
                reused, ok := it.(*listSeriesIterator)
                if !ok {
                        return NewListSeriesIterator(list)
                }
                reused.Reset(list)
                return reused
        }
        return &SeriesEntry{Lset: lset, SampleIteratorFn: iterFn}
}

// NewListChunkSeriesFromSamples builds a chunk series entry for lset with one
// chunk per provided sample slice. If encoding any slice fails, the returned
// entry's iterator yields nothing and reports that error.
// NOTE: It uses an inefficient chunks encoding implementation, not caring about chunk size.
// Use only for testing.
func NewListChunkSeriesFromSamples(lset labels.Labels, samples ...[]chunks.Sample) *ChunkSeriesEntry {
        metas := make([]chunks.Meta, 0, len(samples))
        for _, group := range samples {
                meta, err := chunks.ChunkFromSamples(group)
                if err != nil {
                        failed := errChunksIterator{err: err}
                        return &ChunkSeriesEntry{
                                Lset:            lset,
                                ChunkIteratorFn: func(chunks.Iterator) chunks.Iterator { return failed },
                        }
                }
                metas = append(metas, meta)
        }

        iterFn := func(it chunks.Iterator) chunks.Iterator {
                reused, ok := it.(*listChunkSeriesIterator)
                if !ok {
                        fresh := make([]chunks.Meta, 0, len(samples))
                        return NewListChunkSeriesIterator(append(fresh, metas...)...)
                }
                // Reuse the backing array of the iterator that was handed in.
                reused.Reset(append(reused.chks[:0], metas...)...)
                return reused
        }
        return &ChunkSeriesEntry{Lset: lset, ChunkIteratorFn: iterFn}
}

// listSeriesIterator iterates over an in-memory list of samples.
type listSeriesIterator struct {
        // samples is the list being iterated.
        samples Samples
        // idx is the position of the current sample; it is -1 before the first Next or Seek.
        idx int
}

// samples is a slice of chunks.Sample that implements the Samples interface.
type samples []chunks.Sample

// Get returns the i-th sample; Len returns the number of samples.
func (s samples) Get(i int) chunks.Sample { return s[i] }
func (s samples) Len() int                { return len(s) }

// Samples interface allows to work on arrays of types that are compatible with chunks.Sample.
type Samples interface {
        // Get returns the sample at index i.
        Get(i int) chunks.Sample
        // Len returns the number of samples.
        Len() int
}

// NewListSeriesIterator creates an iterator over the provided samples,
// positioned before the first sample.
func NewListSeriesIterator(samples Samples) chunkenc.Iterator {
        it := new(listSeriesIterator)
        it.samples, it.idx = samples, -1
        return it
}

// Reset points the iterator at a new list of samples and rewinds it to the
// position before the first sample.
func (it *listSeriesIterator) Reset(samples Samples) {
        it.samples, it.idx = samples, -1
}

// At returns the timestamp and float value of the current sample.
func (it *listSeriesIterator) At() (int64, float64) {
        cur := it.samples.Get(it.idx)
        return cur.T(), cur.F()
}

// AtHistogram returns the timestamp and histogram of the current sample.
// The histogram argument is ignored.
func (it *listSeriesIterator) AtHistogram(*histogram.Histogram) (int64, *histogram.Histogram) {
        cur := it.samples.Get(it.idx)
        return cur.T(), cur.H()
}

// AtFloatHistogram returns the timestamp and float histogram of the current
// sample. The float histogram argument is ignored.
func (it *listSeriesIterator) AtFloatHistogram(*histogram.FloatHistogram) (int64, *histogram.FloatHistogram) {
        cur := it.samples.Get(it.idx)
        return cur.T(), cur.FH()
}

// AtT returns the timestamp of the current sample.
func (it *listSeriesIterator) AtT() int64 {
        return it.samples.Get(it.idx).T()
}

// Next advances to the following sample and returns its value type, or
// chunkenc.ValNone once the list is exhausted.
func (it *listSeriesIterator) Next() chunkenc.ValueType {
        it.idx++
        if it.idx < it.samples.Len() {
                return it.samples.Get(it.idx).Type()
        }
        return chunkenc.ValNone
}

// Seek moves the iterator to the first sample at or after the current position
// whose timestamp is >= t and returns its value type. It returns
// chunkenc.ValNone when no such sample exists. The iterator never moves backwards.
func (it *listSeriesIterator) Seek(t int64) chunkenc.ValueType {
        // A fresh iterator starts seeking from the first sample.
        if it.idx == -1 {
                it.idx = 0
        }
        if it.idx >= it.samples.Len() {
                return chunkenc.ValNone
        }

        // Nothing to do when the current sample already satisfies the seek.
        if cur := it.samples.Get(it.idx); cur.T() >= t {
                return cur.Type()
        }

        // Binary search over the remaining samples, relative to the current position.
        start := it.idx
        offset := sort.Search(it.samples.Len()-start, func(off int) bool {
                return it.samples.Get(start+off).T() >= t
        })
        it.idx = start + offset

        if it.idx >= it.samples.Len() {
                return chunkenc.ValNone
        }
        return it.samples.Get(it.idx).Type()
}

// Err always returns nil.
func (it *listSeriesIterator) Err() error { return nil }

// listChunkSeriesIterator iterates over an in-memory list of chunk metas.
type listChunkSeriesIterator struct {
        // chks is the list being iterated.
        chks []chunks.Meta
        // idx is the position of the current chunk; it is -1 before the first Next.
        idx int
}

// NewListChunkSeriesIterator creates an iterator over the provided chunks,
// positioned before the first chunk.
func NewListChunkSeriesIterator(chks ...chunks.Meta) chunks.Iterator {
        it := new(listChunkSeriesIterator)
        it.chks, it.idx = chks, -1
        return it
}

// Reset points the iterator at a new list of chunks and rewinds it to the
// position before the first chunk.
func (it *listChunkSeriesIterator) Reset(chks ...chunks.Meta) {
        it.chks, it.idx = chks, -1
}

// At returns the chunk meta at the current position.
func (it *listChunkSeriesIterator) At() chunks.Meta {
        cur := it.chks[it.idx]
        return cur
}

// Next advances to the following chunk and reports whether one exists.
func (it *listChunkSeriesIterator) Next() bool {
        it.idx++
        hasChunk := it.idx < len(it.chks)
        return hasChunk
}

// Err always returns nil.
func (it *listChunkSeriesIterator) Err() error { return nil }

// chunkSetToSeriesSet adapts a ChunkSeriesSet so that it can be consumed as a set of Series.
type chunkSetToSeriesSet struct {
        ChunkSeriesSet

        // iter is the chunk iterator of the current series; it is passed back in for reuse on the next series.
        iter chunks.Iterator
        // chkIterErr holds an error reported by iter; once set, Err returns it and Next returns false.
        chkIterErr error
        // sameSeriesChunks holds one Series per chunk of the current series.
        sameSeriesChunks []Series
}

// NewSeriesSetFromChunkSeriesSet wraps chk so that it can be read as a
// SeriesSet, decoding chunks one by one.
func NewSeriesSetFromChunkSeriesSet(chk ChunkSeriesSet) SeriesSet {
        adapter := new(chunkSetToSeriesSet)
        adapter.ChunkSeriesSet = chk
        return adapter
}

// Next advances to the following series of the wrapped set and collects one
// decoder per chunk of that series. It returns false when an error is already
// recorded, when the wrapped set is exhausted, or when iterating the chunks of
// the series fails.
func (c *chunkSetToSeriesSet) Next() bool {
        if c.Err() != nil {
                return false
        }
        if !c.ChunkSeriesSet.Next() {
                return false
        }

        c.iter = c.ChunkSeriesSet.At().Iterator(c.iter)
        c.sameSeriesChunks = nil

        for c.iter.Next() {
                decoder := newChunkToSeriesDecoder(c.ChunkSeriesSet.At().Labels(), c.iter.At())
                c.sameSeriesChunks = append(c.sameSeriesChunks, decoder)
        }

        if err := c.iter.Err(); err != nil {
                c.chkIterErr = err
                return false
        }
        return true
}

// At returns the current series, merged from the per-chunk series collected
// by Next for that same series.
func (c *chunkSetToSeriesSet) At() Series {
        merged := ChainedSeriesMerge(c.sameSeriesChunks...)
        return merged
}

// Err returns the recorded chunk iteration error if there is one, and the
// wrapped set's error otherwise.
func (c *chunkSetToSeriesSet) Err() error {
        if c.chkIterErr == nil {
                return c.ChunkSeriesSet.Err()
        }
        return c.chkIterErr
}

// newChunkToSeriesDecoder returns a Series with the given labels whose sample
// iterator is the iterator of the chunk held by chk.
func newChunkToSeriesDecoder(lset labels.Labels, chk chunks.Meta) Series {
        decode := func(it chunkenc.Iterator) chunkenc.Iterator {
                // TODO(bwplotka): Can we provide any chunkenc buffer?
                return chk.Chunk.Iterator(it)
        }
        return &SeriesEntry{Lset: lset, SampleIteratorFn: decode}
}

// seriesSetToChunkSet adapts a SeriesSet so that it can be consumed as a ChunkSeriesSet.
type seriesSetToChunkSet struct {
        SeriesSet
}

// NewSeriesSetToChunkSet wraps chk so that it can be read as a ChunkSeriesSet,
// encoding chunks from samples.
func NewSeriesSetToChunkSet(chk SeriesSet) ChunkSeriesSet {
        adapter := new(seriesSetToChunkSet)
        adapter.SeriesSet = chk
        return adapter
}

// Next advances the wrapped set. It returns false without advancing when an
// error is already present.
func (c *seriesSetToChunkSet) Next() bool {
        return c.Err() == nil && c.SeriesSet.Next()
}

// At returns the current series of the wrapped set as a ChunkSeries.
func (c *seriesSetToChunkSet) At() ChunkSeries {
        current := c.SeriesSet.At()
        return NewSeriesToChunkEncoder(current)
}

// Err returns the error of the wrapped set.
func (c *seriesSetToChunkSet) Err() error {
        err := c.SeriesSet.Err()
        return err
}

// seriesToChunkEncoder wraps a Series and exposes its samples as encoded chunks
// through its Iterator method.
type seriesToChunkEncoder struct {
        Series
}

// seriesToChunkEncoderSplit is the number of samples after which
// seriesToChunkEncoder.Iterator starts a new chunk.
const seriesToChunkEncoderSplit = 120

// NewSeriesToChunkEncoder wraps series so that its samples are encoded into
// chunks with a 120 samples limit.
func NewSeriesToChunkEncoder(series Series) ChunkSeries {
        return &seriesToChunkEncoder{Series: series}
}

// Iterator reads all samples of the wrapped series, encodes them into chunks and
// returns an iterator over those chunks.
//
// A new chunk is started whenever the sample type changes or the current chunk
// has reached seriesToChunkEncoderSplit samples. For histogram and float
// histogram samples the appender may additionally hand back a new chunk.
//
// If it is a *listChunkSeriesIterator, its backing slice is reused and it is
// reset and returned. On any failure an iterator that only reports the error is
// returned instead.
func (s *seriesToChunkEncoder) Iterator(it chunks.Iterator) chunks.Iterator {
        var (
                chk, newChk chunkenc.Chunk
                app         chunkenc.Appender
                err         error
                recoded     bool
        )
        // mint == math.MaxInt64 doubles as the marker for "no sample recorded in the current chunk yet".
        mint := int64(math.MaxInt64)
        maxt := int64(math.MinInt64)

        var chks []chunks.Meta
        lcsi, existing := it.(*listChunkSeriesIterator)
        if existing {
                chks = lcsi.chks[:0]
        }

        // i counts the samples in the current chunk.
        i := 0
        seriesIter := s.Series.Iterator(nil)
        lastType := chunkenc.ValNone
        for typ := seriesIter.Next(); typ != chunkenc.ValNone; typ = seriesIter.Next() {
                if typ != lastType || i >= seriesToChunkEncoderSplit {
                        // Create a new chunk if the sample type changed or too many samples in the current one.
                        // appendChunk ignores a nil chk, so nothing is appended on the very first sample.
                        chks = appendChunk(chks, mint, maxt, chk)
                        chk, err = chunkenc.NewEmptyChunk(typ.ChunkEncoding())
                        if err != nil {
                                return errChunksIterator{err: err}
                        }
                        app, err = chk.Appender()
                        if err != nil {
                                return errChunksIterator{err: err}
                        }
                        mint = int64(math.MaxInt64)
                        // maxt is immediately overwritten below which is why setting it here won't make a difference.
                        i = 0
                }
                lastType = typ

                var (
                        t  int64
                        v  float64
                        h  *histogram.Histogram
                        fh *histogram.FloatHistogram
                )
                switch typ {
                case chunkenc.ValFloat:
                        t, v = seriesIter.At()
                        app.Append(t, v)
                case chunkenc.ValHistogram:
                        t, h = seriesIter.AtHistogram(nil)
                        newChk, recoded, app, err = app.AppendHistogram(nil, t, h, false)
                        if err != nil {
                                return errChunksIterator{err: err}
                        }
                        if newChk != nil {
                                // A new chunk that is not a recode closes off the previous chunk;
                                // in either case newChk becomes the current chunk.
                                if !recoded {
                                        chks = appendChunk(chks, mint, maxt, chk)
                                        mint = int64(math.MaxInt64)
                                        // maxt is immediately overwritten below which is why setting it here won't make a difference.
                                        i = 0
                                }
                                chk = newChk
                        }
                case chunkenc.ValFloatHistogram:
                        t, fh = seriesIter.AtFloatHistogram(nil)
                        newChk, recoded, app, err = app.AppendFloatHistogram(nil, t, fh, false)
                        if err != nil {
                                return errChunksIterator{err: err}
                        }
                        if newChk != nil {
                                // Same handling as for integer histograms above.
                                if !recoded {
                                        chks = appendChunk(chks, mint, maxt, chk)
                                        mint = int64(math.MaxInt64)
                                        // maxt is immediately overwritten below which is why setting it here won't make a difference.
                                        i = 0
                                }
                                chk = newChk
                        }
                default:
                        return errChunksIterator{err: fmt.Errorf("unknown sample type %s", typ.String())}
                }

                // Track the time range of the current chunk: maxt follows every sample,
                // mint is only set by the first sample after a reset.
                maxt = t
                if mint == math.MaxInt64 {
                        mint = t
                }
                i++
        }
        if err := seriesIter.Err(); err != nil {
                return errChunksIterator{err: err}
        }

        // Flush the last chunk, if any.
        chks = appendChunk(chks, mint, maxt, chk)

        if existing {
                lcsi.Reset(chks...)
                return lcsi
        }
        return NewListChunkSeriesIterator(chks...)
}

// appendChunk appends a chunks.Meta for chk, with MinTime mint and MaxTime maxt,
// to chks and returns the resulting slice. A nil chk leaves chks unchanged.
func appendChunk(chks []chunks.Meta, mint, maxt int64, chk chunkenc.Chunk) []chunks.Meta {
        if chk == nil {
                return chks
        }
        meta := chunks.Meta{MinTime: mint, MaxTime: maxt, Chunk: chk}
        return append(chks, meta)
}

// errChunksIterator is a chunk iterator that never yields a chunk and only
// reports the error it carries.
type errChunksIterator struct {
        err error
}

// At always returns the zero chunks.Meta.
func (e errChunksIterator) At() chunks.Meta {
        var empty chunks.Meta
        return empty
}

// Next always reports false: there is nothing to iterate over.
func (e errChunksIterator) Next() bool {
        return false
}

// Err returns the carried error.
func (e errChunksIterator) Err() error {
        return e.err
}

// ExpandSamples drains iter and returns all samples it yields, in iteration order,
// together with the iterator's final error.
// newSampleFn constructs each sample, which is useful when comparing sample slices
// built from different sample implementations. When it is nil, the sample types of
// this package are used.
func ExpandSamples(iter chunkenc.Iterator, newSampleFn func(t int64, f float64, h *histogram.Histogram, fh *histogram.FloatHistogram) chunks.Sample) ([]chunks.Sample, error) {
        if newSampleFn == nil {
                newSampleFn = func(t int64, f float64, h *histogram.Histogram, fh *histogram.FloatHistogram) chunks.Sample {
                        if h != nil {
                                return hSample{t, h}
                        }
                        if fh != nil {
                                return fhSample{t, fh}
                        }
                        return fSample{t, f}
                }
        }

        var samples []chunks.Sample
        for typ := iter.Next(); typ != chunkenc.ValNone; typ = iter.Next() {
                switch typ {
                case chunkenc.ValFloat:
                        ts, val := iter.At()
                        if math.IsNaN(val) {
                                // NaN never compares equal to itself, so store a
                                // comparable stand-in value instead.
                                val = -42
                        }
                        samples = append(samples, newSampleFn(ts, val, nil, nil))
                case chunkenc.ValHistogram:
                        ts, hist := iter.AtHistogram(nil)
                        samples = append(samples, newSampleFn(ts, 0, hist, nil))
                case chunkenc.ValFloatHistogram:
                        ts, fhist := iter.AtFloatHistogram(nil)
                        samples = append(samples, newSampleFn(ts, 0, nil, fhist))
                }
        }
        return samples, iter.Err()
}

// ExpandChunks drains iter and returns every chunk meta it yields, in iteration
// order, together with the iterator's final error.
func ExpandChunks(iter chunks.Iterator) ([]chunks.Meta, error) {
        var metas []chunks.Meta
        for {
                if !iter.Next() {
                        break
                }
                metas = append(metas, iter.At())
        }
        return metas, iter.Err()
}

// Copyright 2017 The Prometheus Authors

// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package tsdb

import (
        "context"
        "encoding/json"
        "errors"
        "fmt"
        "io"
        "os"
        "path/filepath"
        "slices"
        "sync"

        "github.com/go-kit/log"
        "github.com/go-kit/log/level"
        "github.com/oklog/ulid"

        "github.com/prometheus/prometheus/model/labels"
        "github.com/prometheus/prometheus/storage"
        "github.com/prometheus/prometheus/tsdb/chunkenc"
        "github.com/prometheus/prometheus/tsdb/chunks"
        tsdb_errors "github.com/prometheus/prometheus/tsdb/errors"
        "github.com/prometheus/prometheus/tsdb/fileutil"
        "github.com/prometheus/prometheus/tsdb/index"
        "github.com/prometheus/prometheus/tsdb/tombstones"
)

// IndexWriter serializes the index for a block of series data.
// The methods must be called in the order they are specified in.
type IndexWriter interface {
        // AddSymbol registers a single symbol.
        // Symbols must be registered in sorted order.
        AddSymbol(sym string) error

        // AddSeries populates the index writer with a series and its offsets
        // of chunks that the index can reference.
        // Implementations may require series to be inserted in strictly increasing order by
        // their labels. The reference numbers are used to resolve entries in postings lists
        // that are added later.
        AddSeries(ref storage.SeriesRef, l labels.Labels, chunks ...chunks.Meta) error

        // Close writes any finalization and closes the resources associated with
        // the underlying writer.
        Close() error
}

// IndexReader provides read access to serialized index data.
type IndexReader interface {
        // Symbols returns an iterator over sorted string symbols that may occur in
        // series' labels and indices. It is not safe to use the returned strings
        // beyond the lifetime of the index reader.
        Symbols() index.StringIter

        // SortedLabelValues returns sorted possible label values.
        SortedLabelValues(ctx context.Context, name string, matchers ...*labels.Matcher) ([]string, error)

        // LabelValues returns possible label values which may not be sorted.
        LabelValues(ctx context.Context, name string, matchers ...*labels.Matcher) ([]string, error)

        // Postings returns the postings list iterator for the label pairs.
        // The Postings here contain the offsets to the series inside the index.
        // Found IDs are not strictly required to point to a valid Series, e.g.
        // during background garbage collections.
        Postings(ctx context.Context, name string, values ...string) (index.Postings, error)

        // PostingsForLabelMatching returns a sorted iterator over postings having a label with the given name and a value for which match returns true.
        // If no postings are found having at least one matching label, an empty iterator is returned.
        PostingsForLabelMatching(ctx context.Context, name string, match func(value string) bool) index.Postings

        // SortedPostings returns a postings list that is reordered to be sorted
        // by the label set of the underlying series.
        SortedPostings(index.Postings) index.Postings

        // ShardedPostings returns a postings list filtered by the provided shardIndex
        // out of shardCount. For a given posting, its shard MUST be computed by hashing
        // the series labels mod shardCount, using a hash function which is consistent over time.
        ShardedPostings(p index.Postings, shardIndex, shardCount uint64) index.Postings

        // Series populates the given builder and chunk metas for the series identified
        // by the reference.
        // Returns storage.ErrNotFound if the ref does not resolve to a known series.
        Series(ref storage.SeriesRef, builder *labels.ScratchBuilder, chks *[]chunks.Meta) error

        // LabelNames returns all the unique label names present in the index in sorted order.
        LabelNames(ctx context.Context, matchers ...*labels.Matcher) ([]string, error)

        // LabelValueFor returns the label value for the given label name in the series referred to by ID.
        // If the series couldn't be found or the series doesn't have the requested label,
        // storage.ErrNotFound is returned as the error.
        LabelValueFor(ctx context.Context, id storage.SeriesRef, label string) (string, error)

        // LabelNamesFor returns all the label names for the series referred to by the postings.
        // The names returned are sorted.
        LabelNamesFor(ctx context.Context, postings index.Postings) ([]string, error)

        // Close releases the underlying resources of the reader.
        Close() error
}

// ChunkWriter serializes a time block of chunked series data.
type ChunkWriter interface {
        // WriteChunks writes several chunks. The Chunk field of each chunks.Meta
        // must be populated.
        // After returning successfully, the Ref field of each chunks.Meta
        // is set and can be used to retrieve the chunk from the written data.
        WriteChunks(chunks ...chunks.Meta) error

        // Close writes any required finalization and closes the resources
        // associated with the underlying writer.
        Close() error
}

// ChunkReader provides read access to serialized time series data.
type ChunkReader interface {
        // ChunkOrIterable returns the series data for the given chunks.Meta.
        // Either a single chunk will be returned, or an iterable.
        // A single chunk should be returned if chunks.Meta maps to a chunk that
        // already exists and doesn't need modifications.
        // An iterable should be returned if chunks.Meta maps to a subset of the
        // samples in a stored chunk, or multiple chunks. (E.g. OOOHeadChunkReader
        // could return an iterable where multiple histogram samples have counter
        // resets. There can only be one counter reset per histogram chunk so
        // multiple chunks would be created from the iterable in this case.)
        // Only one of chunk or iterable should be returned. In some cases you may
        // always expect a chunk to be returned. You can check that iterable is nil
        // in those cases.
        ChunkOrIterable(meta chunks.Meta) (chunkenc.Chunk, chunkenc.Iterable, error)

        // Close releases all underlying resources of the reader.
        Close() error
}

// BlockReader provides read access to a data block.
// Block in this file declares all five of these methods.
type BlockReader interface {
        // Index returns an IndexReader over the block's data.
        Index() (IndexReader, error)

        // Chunks returns a ChunkReader over the block's data.
        Chunks() (ChunkReader, error)

        // Tombstones returns a tombstones.Reader over the block's deleted data.
        Tombstones() (tombstones.Reader, error)

        // Meta provides meta information about the block reader.
        Meta() BlockMeta

        // Size returns the number of bytes that the block takes up on disk.
        Size() int64
}

// BlockMeta provides meta information about a block.
// It is the structure stored as JSON in the block's meta file
// (see readMetaFile and writeMetaFile).
type BlockMeta struct {
        // Unique identifier for the block and its contents. Changes on compaction.
        ULID ulid.ULID `json:"ulid"`

        // MinTime and MaxTime specify the time range all samples
        // in the block are in.
        MinTime int64 `json:"minTime"`
        MaxTime int64 `json:"maxTime"`

        // Stats about the contents of the block.
        Stats BlockStats `json:"stats,omitempty"`

        // Information on compactions the block was created from.
        Compaction BlockMetaCompaction `json:"compaction"`

        // Version of the meta file. writeMetaFile always sets it to metaVersion1
        // and readMetaFile rejects any other value.
        Version int `json:"version"`
}

// BlockStats contains stats about contents of a block.
// All counters are omitted from the JSON encoding when zero.
type BlockStats struct {
        NumSamples    uint64 `json:"numSamples,omitempty"`
        NumSeries     uint64 `json:"numSeries,omitempty"`
        NumChunks     uint64 `json:"numChunks,omitempty"`
        // NumTombstones is refreshed by Block.Delete from the tombstone reader's Total().
        NumTombstones uint64 `json:"numTombstones,omitempty"`
}

// BlockDesc describes a block by ULID and time range.
// It is the element type of BlockMetaCompaction.Parents.
type BlockDesc struct {
        ULID    ulid.ULID `json:"ulid"`
        MinTime int64     `json:"minTime"`
        MaxTime int64     `json:"maxTime"`
}

// BlockMetaCompaction holds information about compactions a block went through.
type BlockMetaCompaction struct {
        // Maximum number of compaction cycles any source block has
        // gone through.
        Level int `json:"level"`
        // ULIDs of all source head blocks that went into the block.
        Sources []ulid.ULID `json:"sources,omitempty"`
        // Indicates that during compaction it resulted in a block without any samples
        // so it should be deleted on the next reloadBlocks.
        Deletable bool `json:"deletable,omitempty"`
        // Short descriptions of the direct blocks that were used to create
        // this block.
        Parents []BlockDesc `json:"parents,omitempty"`
        // Failed is set to true by Block.setCompactionFailed.
        Failed  bool        `json:"failed,omitempty"`
        // Additional information about the compaction, for example, block created from out-of-order chunks.
        // SetOutOfOrder keeps this slice sorted when it adds a hint.
        Hints []string `json:"hints,omitempty"`
}

// SetOutOfOrder records the CompactionHintFromOutOfOrder hint on the block.
// It is a no-op when the hint is already present; otherwise the hint is
// appended and Hints is re-sorted.
func (bm *BlockMetaCompaction) SetOutOfOrder() {
        if !bm.containsHint(CompactionHintFromOutOfOrder) {
                bm.Hints = append(bm.Hints, CompactionHintFromOutOfOrder)
                slices.Sort(bm.Hints)
        }
}

// FromOutOfOrder reports whether the CompactionHintFromOutOfOrder hint is set.
func (bm *BlockMetaCompaction) FromOutOfOrder() bool {
        const hint = CompactionHintFromOutOfOrder
        return bm.containsHint(hint)
}

// containsHint reports whether hint is one of the entries in bm.Hints.
func (bm *BlockMetaCompaction) containsHint(hint string) bool {
        return slices.Contains(bm.Hints, hint)
}

const (
        // indexFilename is the name of the index file inside a block directory.
        indexFilename = "index"
        // metaFilename is the name of the meta file inside a block directory.
        metaFilename  = "meta.json"
        // metaVersion1 is the only meta file version readMetaFile accepts and
        // the version writeMetaFile stamps on every file it writes.
        metaVersion1  = 1

        // CompactionHintFromOutOfOrder is a hint noting that the block
        // was created from out-of-order chunks.
        CompactionHintFromOutOfOrder = "from-out-of-order"
)

// chunkDir returns the path of the "chunks" subdirectory of the block directory dir.
func chunkDir(dir string) string {
        const chunksSubdir = "chunks"
        return filepath.Join(dir, chunksSubdir)
}

// readMetaFile loads and decodes the meta file found in dir.
// It returns the decoded meta and the size of the file contents in bytes.
// An error is returned if the file cannot be read, is not valid JSON, or
// carries a version other than metaVersion1.
func readMetaFile(dir string) (*BlockMeta, int64, error) {
        raw, err := os.ReadFile(filepath.Join(dir, metaFilename))
        if err != nil {
                return nil, 0, err
        }

        meta := new(BlockMeta)
        if err = json.Unmarshal(raw, meta); err != nil {
                return nil, 0, err
        }
        if meta.Version != metaVersion1 {
                return nil, 0, fmt.Errorf("unexpected meta file version %d", meta.Version)
        }

        size := int64(len(raw))
        return meta, size, nil
}

// writeMetaFile stamps meta with metaVersion1 and writes it as indented JSON to
// the meta file in dir. It returns the number of bytes written.
//
// The data is written to a temporary "<meta file>.tmp" sibling, synced, closed
// and then moved over the final path with fileutil.Replace so that the update
// appears atomic. The temporary file is removed on return; a failure to remove
// it is only logged.
func writeMetaFile(logger log.Logger, dir string, meta *BlockMeta) (int64, error) {
        meta.Version = metaVersion1

        // Make any changes to the file appear atomic.
        path := filepath.Join(dir, metaFilename)
        tmp := path + ".tmp"
        defer func() {
                if err := os.RemoveAll(tmp); err != nil {
                        level.Error(logger).Log("msg", "remove tmp file", "err", err.Error())
                }
        }()

        // Encode before opening the temporary file: a marshaling failure then
        // returns without an open file handle that nobody would close.
        jsonMeta, err := json.MarshalIndent(meta, "", "\t")
        if err != nil {
                return 0, err
        }

        f, err := os.Create(tmp)
        if err != nil {
                return 0, err
        }

        n, err := f.Write(jsonMeta)
        if err != nil {
                return 0, tsdb_errors.NewMulti(err, f.Close()).Err()
        }

        // Force the kernel to persist the file on disk to avoid data loss if the host crashes.
        if err := f.Sync(); err != nil {
                return 0, tsdb_errors.NewMulti(err, f.Close()).Err()
        }
        if err := f.Close(); err != nil {
                return 0, err
        }
        return int64(n), fileutil.Replace(tmp, path)
}

// Block represents a directory of time series data covering a continuous time range.
type Block struct {
        // mtx guards closing; Delete also holds it for its whole duration.
        mtx            sync.RWMutex
        // closing is set by Close; once set, startRead and Delete return ErrClosing.
        closing        bool
        // pendingReaders counts readers handed out via Index, Chunks and Tombstones;
        // each reader's Close releases one count and Block.Close waits for zero.
        pendingReaders sync.WaitGroup

        // dir is the block directory passed to OpenBlock.
        dir  string
        // meta is the content of the block's meta file as read by OpenBlock.
        meta BlockMeta

        // Symbol Table Size in bytes.
        // We maintain this variable to avoid recalculation every time.
        symbolTableSize uint64

        chunkr     ChunkReader
        indexr     IndexReader
        tombstones tombstones.Reader

        logger log.Logger

        // Byte sizes of the block's parts; Size returns their sum.
        numBytesChunks    int64
        numBytesIndex     int64
        numBytesTombstone int64
        numBytesMeta      int64
}

// OpenBlock opens the block in the directory. It can be passed a chunk pool, which is used
// to instantiate chunk structs.
//
// A nil logger is replaced by a no-op logger. The meta file, chunk directory,
// index file and tombstones are read in that order; if any step fails, every
// reader opened so far is closed and the close errors are combined with the
// original error.
func OpenBlock(logger log.Logger, dir string, pool chunkenc.Pool) (pb *Block, err error) {
        if logger == nil {
                logger = log.NewNopLogger()
        }
        // closers collects the readers opened so far so the deferred function
        // below can release them if a later step fails.
        var closers []io.Closer
        defer func() {
                // err is the named result, so this sees the error of whichever
                // return statement ended the function.
                if err != nil {
                        err = tsdb_errors.NewMulti(err, tsdb_errors.CloseAll(closers)).Err()
                }
        }()
        meta, sizeMeta, err := readMetaFile(dir)
        if err != nil {
                return nil, err
        }

        cr, err := chunks.NewDirReader(chunkDir(dir), pool)
        if err != nil {
                return nil, err
        }
        closers = append(closers, cr)

        ir, err := index.NewFileReader(filepath.Join(dir, indexFilename))
        if err != nil {
                return nil, err
        }
        closers = append(closers, ir)

        tr, sizeTomb, err := tombstones.ReadTombstones(dir)
        if err != nil {
                return nil, err
        }
        closers = append(closers, tr)

        // The size fields are captured once here; Size sums them later.
        pb = &Block{
                dir:               dir,
                meta:              *meta,
                chunkr:            cr,
                indexr:            ir,
                tombstones:        tr,
                symbolTableSize:   ir.SymbolTableSize(),
                logger:            logger,
                numBytesChunks:    cr.Size(),
                numBytesIndex:     ir.Size(),
                numBytesTombstone: sizeTomb,
                numBytesMeta:      sizeMeta,
        }
        return pb, nil
}

// Close closes the on-disk block. It first marks the block as closing so that
// no new readers are handed out, then blocks until all pending readers are
// released, and finally closes the chunk, index and tombstone readers,
// returning their combined errors.
func (pb *Block) Close() error {
        pb.mtx.Lock()
        pb.closing = true
        pb.mtx.Unlock()

        pb.pendingReaders.Wait()

        chunkErr := pb.chunkr.Close()
        indexErr := pb.indexr.Close()
        tombErr := pb.tombstones.Close()
        return tsdb_errors.NewMulti(chunkErr, indexErr, tombErr).Err()
}

// String returns the string form of the block's ULID.
func (pb *Block) String() string {
        id := pb.meta.ULID
        return id.String()
}

// Dir reports the directory the block was opened from.
func (pb *Block) Dir() string {
        return pb.dir
}

// Meta returns the block's meta information by value.
func (pb *Block) Meta() BlockMeta {
        return pb.meta
}

// MinTime returns the MinTime recorded in the block's meta.
func (pb *Block) MinTime() int64 {
        return pb.meta.MinTime
}

// MaxTime returns the MaxTime recorded in the block's meta.
func (pb *Block) MaxTime() int64 {
        return pb.meta.MaxTime
}

// Size returns the number of bytes that the block takes up: the sum of the
// recorded chunk, index, tombstone and meta sizes.
func (pb *Block) Size() int64 {
        total := pb.numBytesChunks
        total += pb.numBytesIndex
        total += pb.numBytesTombstone
        total += pb.numBytesMeta
        return total
}

// ErrClosing is returned when a block is in the process of being closed.
// Block.startRead (and therefore Index, Chunks and Tombstones) and Block.Delete
// return it once Close has marked the block as closing.
var ErrClosing = errors.New("block is closing")

// startRead registers a new pending reader on the block.
// It returns ErrClosing, without registering anything, when the block is
// already closing. The check and the registration happen under the read lock.
func (pb *Block) startRead() error {
        pb.mtx.RLock()
        defer pb.mtx.RUnlock()

        if !pb.closing {
                pb.pendingReaders.Add(1)
                return nil
        }
        return ErrClosing
}

// Index returns a new IndexReader against the block data.
// The reader is registered as a pending reader of the block; its Close
// releases that registration. ErrClosing is returned if the block is closing.
func (pb *Block) Index() (IndexReader, error) {
        err := pb.startRead()
        if err != nil {
                return nil, err
        }
        reader := blockIndexReader{ir: pb.indexr, b: pb}
        return reader, nil
}

// Chunks returns a new ChunkReader against the block data.
// The reader is registered as a pending reader of the block; its Close
// releases that registration. ErrClosing is returned if the block is closing.
func (pb *Block) Chunks() (ChunkReader, error) {
        err := pb.startRead()
        if err != nil {
                return nil, err
        }
        reader := blockChunkReader{ChunkReader: pb.chunkr, b: pb}
        return reader, nil
}

// Tombstones returns a new tombstones.Reader against the block data.
// The reader is registered as a pending reader of the block; its Close
// releases that registration. ErrClosing is returned if the block is closing.
func (pb *Block) Tombstones() (tombstones.Reader, error) {
        err := pb.startRead()
        if err != nil {
                return nil, err
        }
        reader := blockTombstoneReader{Reader: pb.tombstones, b: pb}
        return reader, nil
}

// GetSymbolTableSize returns the symbol table size of this block's index,
// as captured when the block was opened.
func (pb *Block) GetSymbolTableSize() uint64 {
        size := pb.symbolTableSize
        return size
}

// setCompactionFailed flags the block's compaction as failed and persists the
// updated meta file. On success the recorded meta file size is refreshed.
func (pb *Block) setCompactionFailed() error {
        pb.meta.Compaction.Failed = true

        size, err := writeMetaFile(pb.logger, pb.dir, &pb.meta)
        if err == nil {
                pb.numBytesMeta = size
        }
        return err
}

// blockIndexReader is the IndexReader handed out by Block.Index. It forwards
// calls to the block's underlying index reader, prefixes some errors with the
// block's ULID, and releases one pending reader on the block when closed.
type blockIndexReader struct {
        // ir is the underlying index reader that calls are forwarded to.
        ir IndexReader
        // b is the block this reader was obtained from.
        b  *Block
}

// Symbols forwards to the underlying index reader.
func (r blockIndexReader) Symbols() index.StringIter {
        symbols := r.ir.Symbols()
        return symbols
}

// SortedLabelValues returns the sorted values of label name.
// Without matchers the call is forwarded to the underlying index reader;
// with matchers the values come from r.LabelValues and are sorted here.
// Any error is wrapped with the block's ULID.
func (r blockIndexReader) SortedLabelValues(ctx context.Context, name string, matchers ...*labels.Matcher) ([]string, error) {
        var (
                values []string
                err    error
        )

        if len(matchers) > 0 {
                values, err = r.LabelValues(ctx, name, matchers...)
                if err == nil {
                        slices.Sort(values)
                }
        } else {
                values, err = r.ir.SortedLabelValues(ctx, name)
        }

        if err == nil {
                return values, nil
        }
        return values, fmt.Errorf("block: %s: %w", r.b.Meta().ULID, err)
}

// LabelValues returns the values of label name, in no guaranteed order.
// With matchers the result comes from labelValuesWithMatchers; without them
// the call is forwarded to the underlying index reader and any error is
// wrapped with the block's ULID.
func (r blockIndexReader) LabelValues(ctx context.Context, name string, matchers ...*labels.Matcher) ([]string, error) {
        if len(matchers) > 0 {
                return labelValuesWithMatchers(ctx, r.ir, name, matchers...)
        }

        values, err := r.ir.LabelValues(ctx, name)
        if err != nil {
                return values, fmt.Errorf("block: %s: %w", r.b.Meta().ULID, err)
        }
        return values, nil
}

// LabelNames returns label names for the block. With matchers the result comes
// from labelNamesWithMatchers; without them it is the block's full LabelNames.
func (r blockIndexReader) LabelNames(ctx context.Context, matchers ...*labels.Matcher) ([]string, error) {
        if len(matchers) > 0 {
                return labelNamesWithMatchers(ctx, r.ir, matchers...)
        }
        return r.b.LabelNames(ctx)
}

// Postings forwards to the underlying index reader and wraps any error with
// the block's ULID. The postings value from the underlying reader is returned
// in both the success and the error case.
func (r blockIndexReader) Postings(ctx context.Context, name string, values ...string) (index.Postings, error) {
        postings, err := r.ir.Postings(ctx, name, values...)
        if err == nil {
                return postings, nil
        }
        return postings, fmt.Errorf("block: %s: %w", r.b.Meta().ULID, err)
}

// PostingsForLabelMatching forwards to the underlying index reader.
func (r blockIndexReader) PostingsForLabelMatching(ctx context.Context, name string, match func(string) bool) index.Postings {
        postings := r.ir.PostingsForLabelMatching(ctx, name, match)
        return postings
}

// SortedPostings forwards to the underlying index reader.
func (r blockIndexReader) SortedPostings(p index.Postings) index.Postings {
        sorted := r.ir.SortedPostings(p)
        return sorted
}

// ShardedPostings forwards to the underlying index reader.
func (r blockIndexReader) ShardedPostings(p index.Postings, shardIndex, shardCount uint64) index.Postings {
        sharded := r.ir.ShardedPostings(p, shardIndex, shardCount)
        return sharded
}

// Series forwards to the underlying index reader and wraps any error with the
// block's ULID.
func (r blockIndexReader) Series(ref storage.SeriesRef, builder *labels.ScratchBuilder, chks *[]chunks.Meta) error {
        err := r.ir.Series(ref, builder, chks)
        if err == nil {
                return nil
        }
        return fmt.Errorf("block: %s: %w", r.b.Meta().ULID, err)
}

// Close releases this reader's pending-reader registration on the block.
// It does not close the underlying index reader and always returns nil.
func (r blockIndexReader) Close() error {
        block := r.b
        block.pendingReaders.Done()
        return nil
}

// LabelValueFor returns the value of the given label for the series referred
// to by id, as reported by the underlying index reader.
func (r blockIndexReader) LabelValueFor(ctx context.Context, id storage.SeriesRef, label string) (string, error) {
        value, err := r.ir.LabelValueFor(ctx, id, label)
        return value, err
}

// LabelNamesFor returns all the label names for the series referred to by the
// postings, as reported by the underlying index reader.
// The names returned are sorted.
func (r blockIndexReader) LabelNamesFor(ctx context.Context, postings index.Postings) ([]string, error) {
        names, err := r.ir.LabelNamesFor(ctx, postings)
        return names, err
}

// blockTombstoneReader is the tombstones.Reader handed out by Block.Tombstones.
// It embeds the block's tombstone reader and overrides Close.
type blockTombstoneReader struct {
        tombstones.Reader
        b *Block
}

// Close releases this reader's pending-reader registration on the block.
// It does not close the embedded reader and always returns nil.
func (r blockTombstoneReader) Close() error {
        block := r.b
        block.pendingReaders.Done()
        return nil
}

// blockChunkReader is the ChunkReader handed out by Block.Chunks.
// It embeds the block's chunk reader and overrides Close.
type blockChunkReader struct {
        ChunkReader
        b *Block
}

// Close releases this reader's pending-reader registration on the block.
// It does not close the embedded reader and always returns nil.
func (r blockChunkReader) Close() error {
        block := r.b
        block.pendingReaders.Done()
        return nil
}

// Delete matching series between mint and maxt in the block.
//
// Deletion is recorded as tombstones: for every series selected by ms that has
// a chunk overlapping [mint, maxt], one interval is added, clamped to the time
// range spanned by the series' chunks. The block's existing tombstones are
// merged in, and the result is written to the tombstones file followed by the
// meta file. The write lock is held throughout; ErrClosing is returned if the
// block is closing.
func (pb *Block) Delete(ctx context.Context, mint, maxt int64, ms ...*labels.Matcher) error {
        pb.mtx.Lock()
        defer pb.mtx.Unlock()

        if pb.closing {
                return ErrClosing
        }

        p, err := PostingsForMatchers(ctx, pb.indexr, ms...)
        if err != nil {
                return fmt.Errorf("select series: %w", err)
        }

        ir := pb.indexr

        // Choose only valid postings which have chunks in the time-range.
        stones := tombstones.NewMemTombstones()

        // chks and builder are passed to ir.Series on every iteration.
        var chks []chunks.Meta
        var builder labels.ScratchBuilder

Outer:
        for p.Next() {
                err := ir.Series(p.At(), &builder, &chks)
                if err != nil {
                        return err
                }

                for _, chk := range chks {
                        if chk.OverlapsClosedInterval(mint, maxt) {
                                // Delete only until the current values and not beyond.
                                tmin, tmax := clampInterval(mint, maxt, chks[0].MinTime, chks[len(chks)-1].MaxTime)
                                stones.AddInterval(p.At(), tombstones.Interval{Mint: tmin, Maxt: tmax})
                                // One interval per series is enough; move on to the next series.
                                continue Outer
                        }
                }
        }

        if p.Err() != nil {
                return p.Err()
        }

        // Carry over the tombstones the block already had.
        err = pb.tombstones.Iter(func(id storage.SeriesRef, ivs tombstones.Intervals) error {
                for _, iv := range ivs {
                        stones.AddInterval(id, iv)
                }
                return nil
        })
        if err != nil {
                return err
        }
        pb.tombstones = stones
        pb.meta.Stats.NumTombstones = pb.tombstones.Total()

        // Persist the tombstones first, then the meta file carrying the updated
        // tombstone count, refreshing the recorded file sizes as we go.
        n, err := tombstones.WriteFile(pb.logger, pb.dir, pb.tombstones)
        if err != nil {
                return err
        }
        pb.numBytesTombstone = n
        n, err = writeMetaFile(pb.logger, pb.dir, &pb.meta)
        if err != nil {
                return err
        }
        pb.numBytesMeta = n
        return nil
}

// CleanTombstones will remove the tombstones and rewrite the block (only if there are any tombstones).
// If there was a rewrite, then it returns the ULIDs of the new blocks written, else nil.
// If a resultant block is empty (tombstones covered the whole block), then it returns an empty slice.
// It also returns a boolean indicating whether the parent block can be deleted safely or not.
func (pb *Block) CleanTombstones(dest string, c Compactor) ([]ulid.ULID, bool, error) {
        var numStones int
        countStones := func(id storage.SeriesRef, ivs tombstones.Intervals) error {
                numStones += len(ivs)
                return nil
        }
        if err := pb.tombstones.Iter(countStones); err != nil {
                // This should never happen, as the counting callback only returns nil.
                panic(err)
        }
        if numStones == 0 {
                return nil, false, nil
        }

        // Hand the compactor a copy of the meta rather than the block's own.
        meta := pb.Meta()
        uids, err := c.Write(dest, pb, pb.meta.MinTime, pb.meta.MaxTime, &meta)
        if err != nil {
                return nil, false, err
        }
        return uids, true, nil
}

// Snapshot creates a snapshot of the block inside dir, in a subdirectory named
// after the block's ULID. The meta, index and tombstones files and every entry
// of the chunks directory are hardlinked from the block's directory.
func (pb *Block) Snapshot(dir string) error {
        snapDir := filepath.Join(dir, pb.meta.ULID.String())
        if err := os.MkdirAll(snapDir, 0o777); err != nil {
                return fmt.Errorf("create snapshot block dir: %w", err)
        }

        snapChunkDir := chunkDir(snapDir)
        if err := os.MkdirAll(snapChunkDir, 0o777); err != nil {
                return fmt.Errorf("create snapshot chunk dir: %w", err)
        }

        // Hardlink meta, index and tombstones.
        topLevel := [...]string{metaFilename, indexFilename, tombstones.TombstonesFilename}
        for _, name := range topLevel {
                src := filepath.Join(pb.dir, name)
                dst := filepath.Join(snapDir, name)
                if err := os.Link(src, dst); err != nil {
                        return fmt.Errorf("create snapshot %s: %w", name, err)
                }
        }

        // Hardlink the chunks.
        srcChunkDir := chunkDir(pb.dir)
        entries, err := os.ReadDir(srcChunkDir)
        if err != nil {
                return fmt.Errorf("ReadDir the current chunk dir: %w", err)
        }
        for _, entry := range entries {
                name := entry.Name()
                if err := os.Link(filepath.Join(srcChunkDir, name), filepath.Join(snapChunkDir, name)); err != nil {
                        return fmt.Errorf("hardlink a chunk: %w", err)
                }
        }

        return nil
}

// OverlapsClosedInterval returns true if the block overlaps [mint, maxt].
// The block itself covers the half-open interval
// [pb.meta.MinTime, pb.meta.MaxTime).
func (pb *Block) OverlapsClosedInterval(mint, maxt int64) bool {
        if maxt < pb.meta.MinTime {
                // The queried interval ends before the block starts.
                return false
        }
        return mint < pb.meta.MaxTime
}

// LabelNames returns all the unique label names present in the Block in sorted
// order, as reported by the block's index reader.
func (pb *Block) LabelNames(ctx context.Context) ([]string, error) {
        names, err := pb.indexr.LabelNames(ctx)
        return names, err
}

// clampInterval restricts the interval [a, b] to the bounds [mint, maxt]:
// the lower end is raised to mint if it lies below it, and the upper end is
// lowered to maxt if it lies above it.
func clampInterval(a, b, mint, maxt int64) (int64, int64) {
        return max(a, mint), min(b, maxt)
}

// Copyright 2020 The Prometheus Authors
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package tsdb

import (
        "context"
        "errors"
        "fmt"
        "math"
        "os"

        "github.com/go-kit/log"
        "github.com/go-kit/log/level"
        "github.com/oklog/ulid"

        "github.com/prometheus/prometheus/model/timestamp"
        "github.com/prometheus/prometheus/storage"
        "github.com/prometheus/prometheus/tsdb/chunkenc"
)

// BlockWriter is a block writer that allows appending and flushing series to disk.
type BlockWriter struct {
        logger         log.Logger
        // destinationDir is the directory Flush asks the compactor to write into.
        destinationDir string

        // head accumulates appended series until Flush is called.
        head      *Head
        blockSize int64 // in ms
        // chunkDir is the temporary directory created by initHead and used as the
        // head's chunk directory root; Close removes it.
        chunkDir  string
}

// ErrNoSeriesAppended is returned if the series count is zero while flushing blocks.
// NOTE(review): BlockWriter.Flush in this file does not return it; when no block
// is produced it returns an empty ULID with a nil error. Confirm whether any
// caller still relies on this sentinel.
var ErrNoSeriesAppended = errors.New("no series appended, aborting")

// NewBlockWriter creates a new block writer.
//
// The returned writer accumulates all the series in the Head block until `Flush` is called.
//
// Note that the writer will not check if the target directory exists or
// contains anything at all. It is the caller's responsibility to
// ensure that the resulting blocks do not overlap etc.
// Writer ensures the block flush is atomic (via rename).
func NewBlockWriter(logger log.Logger, dir string, blockSize int64) (*BlockWriter, error) {
        w := new(BlockWriter)
        w.logger = logger
        w.destinationDir = dir
        w.blockSize = blockSize

        // The head must be ready before the writer is handed out.
        err := w.initHead()
        if err != nil {
                return nil, err
        }
        return w, nil
}

// initHead creates and initialises a new TSDB head backed by a fresh temporary
// chunk directory.
//
// The directory is created under os.TempDir and recorded in w.chunkDir so that
// Close can remove it. If creating or initialising the head fails, the
// directory is removed again before returning: NewBlockWriter discards the
// writer on error, so nothing else would clean it up. A failure of that
// cleanup is joined onto the returned error.
func (w *BlockWriter) initHead() (err error) {
        chunkDir, err := os.MkdirTemp(os.TempDir(), "head")
        if err != nil {
                return fmt.Errorf("create temp dir: %w", err)
        }
        // err is a named result, so this cleanup observes the final outcome of
        // whichever return statement ends the function.
        defer func() {
                if err == nil {
                        return
                }
                if rmErr := os.RemoveAll(chunkDir); rmErr != nil {
                        err = errors.Join(err, fmt.Errorf("remove temp dir: %w", rmErr))
                }
        }()

        w.chunkDir = chunkDir
        opts := DefaultHeadOptions()
        opts.ChunkRange = w.blockSize
        opts.ChunkDirRoot = w.chunkDir
        opts.EnableNativeHistograms.Store(true)
        h, err := NewHead(nil, w.logger, nil, nil, opts, NewHeadStats())
        if err != nil {
                return fmt.Errorf("tsdb.NewHead: %w", err)
        }

        w.head = h
        return w.head.Init(math.MinInt64)
}

// Appender returns a new appender on the writer's head.
// Appender can't be called concurrently. However, the returned Appender can safely be used concurrently.
func (w *BlockWriter) Appender(ctx context.Context) storage.Appender {
        app := w.head.Appender(ctx)
        return app
}

// Flush implements the Writer interface. This is where actual block writing
// happens. After flush completes, no writes can be done.
//
// It returns the ULID of the first block written by the compactor. If the
// compactor produced no block at all, the zero ulid.ULID is returned with a
// nil error; callers have to check for that case themselves.
func (w *BlockWriter) Flush(ctx context.Context) (ulid.ULID, error) {
        var none ulid.ULID

        // Block intervals are half-open, [b.MinTime, b.MaxTime), so the block's
        // maxt has to be one millisecond past the newest sample in the head.
        mint, maxt := w.head.MinTime(), w.head.MaxTime()+1
        level.Info(w.logger).Log(
                "msg", "flushing",
                "series_count", w.head.NumSeries(),
                "mint", timestamp.Time(mint),
                "maxt", timestamp.Time(maxt),
        )

        ranges := []int64{w.blockSize}
        compactor, err := NewLeveledCompactor(ctx, nil, w.logger, ranges, chunkenc.NewPool(), nil)
        if err != nil {
                return none, fmt.Errorf("create leveled compactor: %w", err)
        }

        ids, err := compactor.Write(w.destinationDir, w.head, mint, maxt, nil)
        switch {
        case err != nil:
                return none, fmt.Errorf("compactor write: %w", err)
        case len(ids) == 0:
                // No block was produced. Caller is responsible to check empty
                // ulid.ULID based on its use case.
                return none, nil
        }
        return ids[0], nil
}

// Close closes the underlying head and afterwards removes the temporary chunk
// directory. A failure to remove the directory is only logged; the returned
// error is the one from closing the head.
func (w *BlockWriter) Close() error {
        removeChunkDir := func() {
                err := os.RemoveAll(w.chunkDir)
                if err == nil {
                        return
                }
                level.Error(w.logger).Log("msg", "error in deleting BlockWriter files", "err", err)
        }
        defer removeChunkDir()

        return w.head.Close()
}

// Copyright 2017 The Prometheus Authors
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// The code in this file was largely written by Damian Gryski as part of
// https://github.com/dgryski/go-tsz and published under the license below.
// It received minor modifications to suit Prometheus's needs.

// Copyright (c) 2015,2016 Damian Gryski <damian@gryski.com>
// All rights reserved.

// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are met:

// * Redistributions of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistributions in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
// ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
// WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
// DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
// FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
// DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
// SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

package chunkenc

import (
        "encoding/binary"
        "io"
)

// bstream is a stream of bits.
type bstream struct {
        stream []byte // The data stream.
        count  uint8  // How many right-most bits are available for writing in the current byte (the last byte of the stream).
}

// Reset makes b wrap the given stream and marks the last byte as having no
// bits left for writing.
func (b *bstream) Reset(stream []byte) {
        b.stream, b.count = stream, 0
}

// bytes returns the underlying byte slice without copying it.
func (b *bstream) bytes() []byte {
        data := b.stream
        return data
}

// bit is a single bit of a bstream, represented as a bool.
type bit bool

// The two possible values of a bit.
const (
        zero bit = false
        one  bit = true
)

// writeBit appends a single bit to the stream. A new zero byte is added first
// when the current last byte has no free bits left; bits fill each byte from
// the most significant position downwards.
func (b *bstream) writeBit(bit bit) {
        if b.count == 0 {
                b.stream = append(b.stream, 0)
                b.count = 8
        }

        // After the decrement, b.count is the position of the bit being written.
        b.count--
        if bit {
                b.stream[len(b.stream)-1] |= 1 << b.count
        }
}

// writeByte appends the 8 bits of byt to the stream. The free bits of the
// current last byte are filled first and the rest goes into a newly appended
// byte, so b.count is the same before and after (apart from the initial
// 0 -> 8 adjustment).
func (b *bstream) writeByte(byt byte) {
        if b.count == 0 {
                b.stream = append(b.stream, 0)
                b.count = 8
        }

        // Complete the last byte with the leftmost b.count bits from byt.
        last := len(b.stream) - 1
        b.stream[last] |= byt >> (8 - b.count)

        // Start a new byte holding the remainder, if any, in its leftmost bits.
        b.stream = append(b.stream, byt<<b.count)
}

// writeBits writes the nbits right-most bits of u to the stream
// in left-to-right order.
func (b *bstream) writeBits(u uint64, nbits int) {
        // Move the bits of interest to the top of u so they can be peeled off
        // from the left.
        u <<= 64 - uint(nbits)

        // Whole bytes first.
        for ; nbits >= 8; nbits -= 8 {
                b.writeByte(byte(u >> 56))
                u <<= 8
        }

        // Then whatever is left, bit by bit.
        for ; nbits > 0; nbits-- {
                b.writeBit(u>>63 == 1)
                u <<= 1
        }
}

// bstreamReader reads bits from a byte slice, buffering up to 64 bits at a
// time in an internal uint64.
type bstreamReader struct {
        stream       []byte
        streamOffset int // The offset from which to read the next byte from the stream.

        buffer uint64 // The current buffer, filled from the stream, containing up to 8 bytes from which to read bits.
        valid  uint8  // The number of right-most bits valid to read (from left) in the current 8 byte buffer.
        last   byte   // A copy of the last byte of the stream.
}

// newBReader returns a reader positioned at the start of b. For a non-empty b
// it remembers a copy of the final byte.
func newBReader(b []byte) bstreamReader {
        r := bstreamReader{stream: b}
        // The last byte of the stream can be updated later, so we take a copy.
        if n := len(b); n > 0 {
                r.last = b[n-1]
        }
        return r
}

// readBit reads the next bit, refilling the internal buffer from the stream
// when it is empty. It returns io.EOF when no more data can be loaded.
func (b *bstreamReader) readBit() (bit, error) {
        if b.valid == 0 && !b.loadNextBuffer(1) {
                return false, io.EOF
        }
        return b.readBitFast()
}

// readBitFast is like readBit but never refills the internal buffer: it
// returns io.EOF if the buffer is empty, in which case the caller should
// retry with readBit().
// This function must be kept small and a leaf in order to help the compiler inlining it
// and further improve performances.
func (b *bstreamReader) readBitFast() (bit, error) {
        if b.valid == 0 {
                return false, io.EOF
        }

        b.valid--
        return (b.buffer>>b.valid)&1 == 1, nil
}

// readBits constructs a uint64 with the nbits right-most bits
// read from the stream, and any other bits 0.
//
// It returns io.EOF if the stream does not hold nbits more bits. In that case
// bits may already have been consumed from the internal buffer.
func (b *bstreamReader) readBits(nbits uint8) (uint64, error) {
        if b.valid == 0 {
                if !b.loadNextBuffer(nbits) {
                        return 0, io.EOF
                }
        }

        if nbits <= b.valid {
                return b.readBitsFast(nbits)
        }

        // We have to read all remaining valid bits from the current buffer and a part from the next one.
        bitmask := (uint64(1) << b.valid) - 1
        nbits -= b.valid
        v := (b.buffer & bitmask) << nbits
        b.valid = 0

        if !b.loadNextBuffer(nbits) {
                return 0, io.EOF
        }

        // Near the end of the stream loadNextBuffer can succeed while delivering
        // fewer bits than requested. Without this check b.valid-nbits would wrap
        // around (both are uint8), yielding a bogus value and a corrupted
        // b.valid instead of an error.
        if nbits > b.valid {
                return 0, io.EOF
        }

        bitmask = (uint64(1) << nbits) - 1
        v |= ((b.buffer >> (b.valid - nbits)) & bitmask)
        b.valid -= nbits

        return v, nil
}

// readBitsFast is like readBits but only serves bits that are already in the
// internal buffer: it returns io.EOF if fewer than nbits are buffered, in
// which case the caller should retry with readBits().
// This function must be kept small and a leaf in order to help the compiler inlining it
// and further improve performances.
func (b *bstreamReader) readBitsFast(nbits uint8) (uint64, error) {
        if b.valid < nbits {
                return 0, io.EOF
        }

        b.valid -= nbits
        return (b.buffer >> b.valid) & ((uint64(1) << nbits) - 1), nil
}

// ReadByte reads the next 8 bits from the stream and returns them as a byte.
// Any error from readBits is passed through unchanged.
func (b *bstreamReader) ReadByte() (byte, error) {
        bits, err := b.readBits(8)
        if err == nil {
                return byte(bits), nil
        }
        return 0, err
}

// loadNextBuffer loads the next bytes from the stream into the internal buffer.
// The input nbits is the minimum number of bits that must be read, but the implementation
// can read more (if possible) to improve performances.
//
// It returns false, without touching the buffer, when the stream has no bytes
// left. Otherwise it returns true; near the end of the stream the buffer may
// then hold fewer than nbits bits (b.valid tells how many).
func (b *bstreamReader) loadNextBuffer(nbits uint8) bool {
        if b.streamOffset >= len(b.stream) {
                return false
        }

        // Handle the case there are more than 8 bytes in the buffer (most common case)
        // in an optimized way. It's guaranteed that this branch will never read from the
        // very last byte of the stream (which suffers race conditions due to concurrent
        // writes).
        if b.streamOffset+8 < len(b.stream) {
                b.buffer = binary.BigEndian.Uint64(b.stream[b.streamOffset:])
                b.streamOffset += 8
                b.valid = 64
                return true
        }

        // We're here if there are 8 or fewer bytes left in the stream.
        // The following code is slower but called less frequently.
        // Read just enough whole bytes to cover nbits, capped at what is left.
        nbytes := int((nbits / 8) + 1)
        if b.streamOffset+nbytes > len(b.stream) {
                nbytes = len(b.stream) - b.streamOffset
        }

        buffer := uint64(0)
        skip := 0
        if b.streamOffset+nbytes == len(b.stream) {
                // There can be concurrent writes happening on the very last byte
                // of the stream, so use the copy we took at initialization time.
                buffer |= uint64(b.last)
                // Read up to the byte before
                skip = 1
        }

        // Assemble the remaining bytes big-endian above the (optional) last byte.
        for i := 0; i < nbytes-skip; i++ {
                buffer |= (uint64(b.stream[b.streamOffset+i]) << uint(8*(nbytes-i-1)))
        }

        b.buffer = buffer
        b.streamOffset += nbytes
        b.valid = uint8(nbytes * 8)

        return true
}

// Copyright 2017 The Prometheus Authors
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package chunkenc

import (
        "fmt"
        "math"
        "sync"

        "github.com/prometheus/prometheus/model/histogram"
)

// Encoding is the identifier for a chunk encoding.
type Encoding uint8

// The different available chunk encodings.
const (
        EncNone Encoding = iota
        EncXOR
        EncHistogram
        EncFloatHistogram
)

// String returns a human-readable name for the encoding, or "<unknown>" for
// values that are not one of the declared constants.
func (e Encoding) String() string {
        switch e {
        case EncFloatHistogram:
                return "floathistogram"
        case EncHistogram:
                return "histogram"
        case EncXOR:
                return "XOR"
        case EncNone:
                return "none"
        default:
                return "<unknown>"
        }
}

// IsValidEncoding returns true for supported encodings.
// EncNone is not considered a supported encoding.
func IsValidEncoding(e Encoding) bool {
        switch e {
        case EncXOR, EncHistogram, EncFloatHistogram:
                return true
        }
        return false
}

// Size limits and targets for chunks.
const (
        // MaxBytesPerXORChunk is the maximum size an XOR chunk can be.
        MaxBytesPerXORChunk = 1024
        // TargetBytesPerHistogramChunk sets a size target for each histogram chunk.
        TargetBytesPerHistogramChunk = 1024
        // MinSamplesPerHistogramChunk sets a minimum sample count for histogram chunks. This is desirable because a single
        // histogram sample can be larger than TargetBytesPerHistogramChunk but we want to avoid too-small sample count
        // chunks so we can achieve some measure of compression advantage even while dealing with really large histograms.
        // Note that this minimum sample count is not enforced across chunk range boundaries (for example, if the chunk
        // range is 100 and the first sample in the chunk range is 99, the next sample will be included in a new chunk
        // resulting in the old chunk containing only a single sample).
        MinSamplesPerHistogramChunk = 10
)

// Chunk holds a sequence of sample pairs that can be iterated over and appended to.
type Chunk interface {
        // Iterable provides Iterator, which is how the samples are read back.
        Iterable

        // Bytes returns the underlying byte slice of the chunk.
        Bytes() []byte

        // Encoding returns the encoding type of the chunk.
        Encoding() Encoding

        // Appender returns an appender to append samples to the chunk.
        Appender() (Appender, error)

        // NumSamples returns the number of samples in the chunk.
        NumSamples() int

        // Compact is called whenever a chunk is expected to be complete (no more
        // samples appended) and the underlying implementation can eventually
        // optimize the chunk.
        // There's no strong guarantee that no samples will be appended once
        // Compact() is called. Implementing this function is optional.
        Compact()

        // Reset resets the chunk given stream.
        Reset(stream []byte)
}

// Iterable is implemented by anything that can hand out an Iterator.
type Iterable interface {
        // Iterator returns an iterator.
        // The iterator passed as argument is for re-use.
        // Depending on implementation, the iterator can
        // be re-used or a new iterator can be allocated.
        Iterator(Iterator) Iterator
}

// Appender adds sample pairs to a chunk.
type Appender interface {
        // Append appends a float sample. Histogram appenders may panic when it
        // is called (FloatHistogramAppender does).
        Append(int64, float64)

        // AppendHistogram and AppendFloatHistogram append a histogram sample to a histogram or float histogram chunk.
        // Appending a histogram may require creating a completely new chunk or recoding (changing) the current chunk.
        // The Appender prev is used to determine if there is a counter reset between the previous Appender and the current Appender.
        // The Appender prev is optional and only taken into account when the first sample is being appended.
        // The bool appendOnly governs what happens when a sample cannot be appended to the current chunk. If appendOnly is true, then
        // in such case an error is returned without modifying the chunk. If appendOnly is false, then a new chunk is created or the
        // current chunk is recoded to accommodate the sample.
        // The returned Chunk c is nil if sample could be appended to the current Chunk, otherwise c is the new Chunk.
        // The returned bool isRecoded can be used to distinguish between the new Chunk c being a completely new Chunk
        // or the current Chunk recoded to a new Chunk.
        // The Appender app that can be used for the next append is always returned.
        AppendHistogram(prev *HistogramAppender, t int64, h *histogram.Histogram, appendOnly bool) (c Chunk, isRecoded bool, app Appender, err error)
        AppendFloatHistogram(prev *FloatHistogramAppender, t int64, h *histogram.FloatHistogram, appendOnly bool) (c Chunk, isRecoded bool, app Appender, err error)
}

// Iterator is a forward-only iterator: it can advance to the next value or
// seek ahead to a timestamp, but never go back.
// Iterator iterates over the samples of a time series, in timestamp-increasing order.
type Iterator interface {
        // Next advances the iterator by one and returns the type of the value
        // at the new position (or ValNone if the iterator is exhausted).
        Next() ValueType
        // Seek advances the iterator forward to the first sample with a
        // timestamp equal or greater than t. If the current sample found by a
        // previous `Next` or `Seek` operation already has this property, Seek
        // has no effect. If a sample has been found, Seek returns the type of
        // its value. Otherwise, it returns ValNone, after which the iterator is
        // exhausted.
        Seek(t int64) ValueType
        // At returns the current timestamp/value pair if the value is a float.
        // Before the iterator has advanced, the behaviour is unspecified.
        At() (int64, float64)
        // AtHistogram returns the current timestamp/value pair if the value is a
        // histogram with integer counts. Before the iterator has advanced, the behaviour
        // is unspecified.
        // The method accepts an optional Histogram object which will be
        // reused when not nil. Otherwise, a new Histogram object will be allocated.
        AtHistogram(*histogram.Histogram) (int64, *histogram.Histogram)
        // AtFloatHistogram returns the current timestamp/value pair if the
        // value is a histogram with floating-point counts. It also works if the
        // value is a histogram with integer counts, in which case a
        // FloatHistogram copy of the histogram is returned. Before the iterator
        // has advanced, the behaviour is unspecified.
        // The method accepts an optional FloatHistogram object which will be
        // reused when not nil. Otherwise, a new FloatHistogram object will be allocated.
        AtFloatHistogram(*histogram.FloatHistogram) (int64, *histogram.FloatHistogram)
        // AtT returns the current timestamp.
        // Before the iterator has advanced, the behaviour is unspecified.
        AtT() int64
        // Err returns the current error. It should be used only after the
        // iterator is exhausted, i.e. `Next` or `Seek` have returned ValNone.
        Err() error
}

// ValueType defines the type of a value an Iterator points to.
type ValueType uint8

// Possible values for ValueType.
const (
        ValNone           ValueType = iota // No value at the current position.
        ValFloat                           // A simple float, retrieved with At.
        ValHistogram                       // A histogram, retrieve with AtHistogram, but AtFloatHistogram works, too.
        ValFloatHistogram                  // A floating-point histogram, retrieve with AtFloatHistogram.
)

// String returns a human-readable name for the value type, or "unknown" for
// values that are not one of the declared constants.
func (v ValueType) String() string {
        switch v {
        case ValFloatHistogram:
                return "floathistogram"
        case ValHistogram:
                return "histogram"
        case ValFloat:
                return "float"
        case ValNone:
                return "none"
        }
        return "unknown"
}

// ChunkEncoding returns the chunk encoding used to store values of type v.
// Value types without a matching encoding map to EncNone.
func (v ValueType) ChunkEncoding() Encoding {
        enc := EncNone
        switch v {
        case ValFloat:
                enc = EncXOR
        case ValHistogram:
                enc = EncHistogram
        case ValFloatHistogram:
                enc = EncFloatHistogram
        }
        return enc
}

// NewChunk returns a new, empty chunk suitable for values of type v.
// An error is returned for value types that have no chunk implementation.
func (v ValueType) NewChunk() (Chunk, error) {
        switch v {
        case ValFloat:
                return NewXORChunk(), nil
        case ValHistogram:
                return NewHistogramChunk(), nil
        case ValFloatHistogram:
                return NewFloatHistogramChunk(), nil
        }
        return nil, fmt.Errorf("value type %v unsupported", v)
}

// MockSeriesIterator returns an iterator for a mock series with custom timeStamps and values.
//
// The iterator starts positioned before the first sample, so that the first
// call to Next moves it onto timestamps[0]. Starting at index 0 would make
// Next skip the first sample and report a single-sample series as empty.
// As with any Iterator, calling At or AtT before Next is not supported.
func MockSeriesIterator(timestamps []int64, values []float64) Iterator {
        return &mockSeriesIterator{
                timeStamps: timestamps,
                values:     values,
                currIndex:  -1,
        }
}

// mockSeriesIterator is an Iterator over fixed slices of timestamps and float
// values. It only yields float samples.
type mockSeriesIterator struct {
        timeStamps []int64
        values     []float64
        currIndex  int
}

// Seek is not supported by the mock and always reports ValNone.
func (it *mockSeriesIterator) Seek(int64) ValueType {
        return ValNone
}

// At returns the timestamp and value at the current index.
func (it *mockSeriesIterator) At() (int64, float64) {
        idx := it.currIndex
        return it.timeStamps[idx], it.values[idx]
}

// AtHistogram always returns math.MinInt64 and a nil histogram.
func (it *mockSeriesIterator) AtHistogram(*histogram.Histogram) (int64, *histogram.Histogram) {
        return math.MinInt64, nil
}

// AtFloatHistogram always returns math.MinInt64 and a nil float histogram.
func (it *mockSeriesIterator) AtFloatHistogram(*histogram.FloatHistogram) (int64, *histogram.FloatHistogram) {
        return math.MinInt64, nil
}

// AtT returns the timestamp at the current index.
func (it *mockSeriesIterator) AtT() int64 {
        idx := it.currIndex
        return it.timeStamps[idx]
}

// Next moves to the following index and reports ValFloat, or reports ValNone
// without moving when the current index is already the last one.
func (it *mockSeriesIterator) Next() ValueType {
        if it.currIndex >= len(it.timeStamps)-1 {
                return ValNone
        }
        it.currIndex++
        return ValFloat
}

// Err always returns nil.
func (it *mockSeriesIterator) Err() error {
        return nil
}

// NewNopIterator returns a new chunk iterator that does not hold any data.
func NewNopIterator() Iterator {
        return nopIterator{}
}

// nopIterator is an Iterator without samples: Next and Seek always report
// ValNone, timestamps are math.MinInt64 and Err is always nil.
type nopIterator struct{}

func (nopIterator) Next() ValueType      { return ValNone }
func (nopIterator) Seek(int64) ValueType { return ValNone }
func (nopIterator) At() (int64, float64) { return math.MinInt64, 0 }
func (nopIterator) AtHistogram(*histogram.Histogram) (int64, *histogram.Histogram) {
        return math.MinInt64, nil
}

func (nopIterator) AtFloatHistogram(*histogram.FloatHistogram) (int64, *histogram.FloatHistogram) {
        return math.MinInt64, nil
}
func (nopIterator) AtT() int64 { return math.MinInt64 }
func (nopIterator) Err() error { return nil }

// Pool is used to create and reuse chunk references to avoid allocations.
type Pool interface {
        // Put hands a chunk back to the pool for later reuse.
        Put(Chunk) error
        // Get returns a chunk with encoding e that wraps the bytes b.
        Get(e Encoding, b []byte) (Chunk, error)
}

// pool is a memory pool of chunk objects.
// It keeps one sync.Pool per supported chunk encoding.
type pool struct {
        xor            sync.Pool
        histogram      sync.Pool
        floatHistogram sync.Pool
}

// NewPool returns a new pool.
// Each of its per-encoding pools allocates an empty chunk of the matching
// type when it has nothing to reuse.
func NewPool() Pool {
        p := &pool{}
        p.xor.New = func() interface{} {
                return &XORChunk{b: bstream{}}
        }
        p.histogram.New = func() interface{} {
                return &HistogramChunk{b: bstream{}}
        }
        p.floatHistogram.New = func() interface{} {
                return &FloatHistogramChunk{b: bstream{}}
        }
        return p
}

// Get takes a chunk of encoding e from the matching pool, resets it to wrap b
// and returns it. An error is returned for encodings without a pool.
func (p *pool) Get(e Encoding, b []byte) (Chunk, error) {
        switch e {
        case EncXOR:
                c := p.xor.Get().(*XORChunk)
                c.Reset(b)
                return c, nil
        case EncHistogram:
                c := p.histogram.Get().(*HistogramChunk)
                c.Reset(b)
                return c, nil
        case EncFloatHistogram:
                c := p.floatHistogram.Get().(*FloatHistogramChunk)
                c.Reset(b)
                return c, nil
        }
        return nil, fmt.Errorf("invalid chunk encoding %q", e)
}

// Put resets c and returns it to the pool matching its encoding. Chunks whose
// concrete type is not the pool's own type for that encoding are silently
// dropped. An error is returned only for encodings without a pool.
func (p *pool) Put(c Chunk) error {
        var (
                target *sync.Pool
                native bool
        )
        switch c.Encoding() {
        case EncXOR:
                target = &p.xor
                _, native = c.(*XORChunk)
        case EncHistogram:
                target = &p.histogram
                _, native = c.(*HistogramChunk)
        case EncFloatHistogram:
                target = &p.floatHistogram
                _, native = c.(*FloatHistogramChunk)
        default:
                return fmt.Errorf("invalid chunk encoding %q", c.Encoding())
        }

        // A non-native chunk may happen often with wrapped chunks. Nothing we
        // can really do about it but returning an error would cause a lot of
        // allocations again. Thus, we just skip it.
        if native {
                c.Reset(nil)
                target.Put(c)
        }
        return nil
}

// FromData returns a chunk from a byte slice of chunk data.
// This is there so that users of the library can easily create chunks from
// bytes. The returned chunk wraps d directly; it is not copied.
func FromData(e Encoding, d []byte) (Chunk, error) {
        bs := bstream{stream: d, count: 0}
        switch e {
        case EncXOR:
                return &XORChunk{b: bs}, nil
        case EncHistogram:
                return &HistogramChunk{b: bs}, nil
        case EncFloatHistogram:
                return &FloatHistogramChunk{b: bs}, nil
        default:
                return nil, fmt.Errorf("invalid chunk encoding %q", e)
        }
}

// NewEmptyChunk returns an empty chunk for the given encoding, or an error if
// the encoding has no chunk implementation.
func NewEmptyChunk(e Encoding) (Chunk, error) {
        switch e {
        case EncXOR:
                return NewXORChunk(), nil
        case EncHistogram:
                return NewHistogramChunk(), nil
        case EncFloatHistogram:
                return NewFloatHistogramChunk(), nil
        default:
                return nil, fmt.Errorf("invalid chunk encoding %q", e)
        }
}

// Copyright 2022 The Prometheus Authors
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package chunkenc

import (
        "encoding/binary"
        "fmt"
        "math"

        "github.com/prometheus/prometheus/model/histogram"
        "github.com/prometheus/prometheus/model/value"
)

// FloatHistogramChunk holds encoded sample data for a sparse, high-resolution
// float histogram.
//
// The chunk starts with a 3-byte header: the sample count as a big-endian
// uint16, followed by one byte whose bits selected by CounterResetHeaderMask
// carry the counter reset header.
//
// Each sample has multiple "fields", stored in the following way (raw = store
// number directly, delta = store delta to the previous number, dod = store
// delta of the delta to the previous number, xor = what we do for regular
// sample values):
//
//        field →    ts    count zeroCount sum []posbuckets []negbuckets
//        sample 1   raw   raw   raw       raw []raw        []raw
//        sample 2   delta xor   xor       xor []xor        []xor
//        sample >2  dod   xor   xor       xor []xor        []xor
type FloatHistogramChunk struct {
        b bstream
}

// NewFloatHistogramChunk returns a new chunk with float histogram encoding.
// The chunk starts out with the 3 zeroed header bytes and room for 128 bytes
// before the stream has to grow.
func NewFloatHistogramChunk() *FloatHistogramChunk {
        c := &FloatHistogramChunk{}
        c.b.stream = make([]byte, 3, 128)
        return c
}

// Reset implements the Chunk interface by resetting the chunk's bit stream
// around the given byte slice.
func (c *FloatHistogramChunk) Reset(stream []byte) {
        c.b.Reset(stream)
}

// xorValue holds all the necessary information to encode
// and decode XOR encoded float64 values.
type xorValue struct {
        // value is the float64 value itself.
        value    float64
        // leading and trailing are XOR encoder state.
        // NOTE(review): presumably the leading/trailing zero-bit counts of the
        // previous XOR delta; FloatHistogramChunk.Appender sets leading to 0xff
        // for an empty chunk. Confirm against the XOR write/read helpers.
        leading  uint8
        trailing uint8
}

// Encoding returns the encoding type, which is always EncFloatHistogram.
func (c *FloatHistogramChunk) Encoding() Encoding {
        return EncFloatHistogram
}

// Bytes returns the underlying byte slice of the chunk.
// The slice is not copied.
func (c *FloatHistogramChunk) Bytes() []byte {
        return c.b.bytes()
}

// NumSamples returns the number of samples in the chunk, as recorded in the
// big-endian uint16 at the start of the chunk's bytes.
func (c *FloatHistogramChunk) NumSamples() int {
        header := c.Bytes()
        count := binary.BigEndian.Uint16(header)
        return int(count)
}

// Layout returns the histogram layout. Only call this on chunks that have at
// least one sample; it panics on an empty chunk.
//
// The layout is decoded from the chunk's bytes after the first two (the
// sample count) by readHistogramChunkLayout.
func (c *FloatHistogramChunk) Layout() (
        schema int32, zeroThreshold float64,
        negativeSpans, positiveSpans []histogram.Span,
        customValues []float64,
        err error,
) {
        if c.NumSamples() == 0 {
                panic("FloatHistogramChunk.Layout() called on an empty chunk")
        }

        afterCount := c.Bytes()[2:]
        reader := newBReader(afterCount)
        return readHistogramChunkLayout(&reader)
}

// GetCounterResetHeader returns the info about the first 2 bits of the chunk
// header, i.e. the third byte of the chunk masked with CounterResetHeaderMask.
func (c *FloatHistogramChunk) GetCounterResetHeader() CounterResetHeader {
        flags := c.Bytes()[2]
        return CounterResetHeader(flags & CounterResetHeaderMask)
}

// Compact implements the Chunk interface.
// When the stream's spare capacity exceeds chunkCompactCapacityThreshold, the
// data is copied into a new slice of exactly the used length.
func (c *FloatHistogramChunk) Compact() {
        used := len(c.b.stream)
        if cap(c.b.stream) <= used+chunkCompactCapacityThreshold {
                return
        }

        trimmed := make([]byte, used)
        copy(trimmed, c.b.stream)
        c.b.stream = trimmed
}

// Appender implements the Chunk interface.
//
// The returned appender writes to this chunk's bit stream. Its state is
// rebuilt by iterating over all samples already in the chunk; an iteration
// error is returned as is.
func (c *FloatHistogramChunk) Appender() (Appender, error) {
        it := c.iterator(nil)

        // To get an appender, we must know the state it would have if we had
        // appended all existing data from scratch. We iterate through the end
        // and populate via the iterator's state.
        for {
                if it.Next() != ValFloatHistogram {
                        break
                }
        }
        if err := it.Err(); err != nil {
                return nil, err
        }

        // Pair every bucket value with its XOR encoder state.
        pBuckets := make([]xorValue, len(it.pBuckets))
        for i, v := range it.pBuckets {
                pBuckets[i] = xorValue{
                        value:    v,
                        leading:  it.pBucketsLeading[i],
                        trailing: it.pBucketsTrailing[i],
                }
        }
        nBuckets := make([]xorValue, len(it.nBuckets))
        for i, v := range it.nBuckets {
                nBuckets[i] = xorValue{
                        value:    v,
                        leading:  it.nBucketsLeading[i],
                        trailing: it.nBucketsTrailing[i],
                }
        }

        app := &FloatHistogramAppender{
                b: &c.b,

                // Layout.
                schema:       it.schema,
                zThreshold:   it.zThreshold,
                pSpans:       it.pSpans,
                nSpans:       it.nSpans,
                customValues: it.customValues,

                // State of the last sample.
                t:        it.t,
                tDelta:   it.tDelta,
                sum:      it.sum,
                cnt:      it.cnt,
                zCnt:     it.zCnt,
                pBuckets: pBuckets,
                nBuckets: nBuckets,
        }
        // An empty chunk gets 0xff as leading value for sum, count and zero count.
        if it.numTotal == 0 {
                app.sum.leading = 0xff
                app.cnt.leading = 0xff
                app.zCnt.leading = 0xff
        }
        return app, nil
}

// iterator returns a *floatHistogramIterator over the chunk's bytes. If it is
// already a *floatHistogramIterator it is reset and reused; otherwise a new
// one is allocated.
func (c *FloatHistogramChunk) iterator(it Iterator) *floatHistogramIterator {
        // This comment is copied from XORChunk.iterator:
        //   Should iterators guarantee to act on a copy of the data so it doesn't lock append?
        //   When using striped locks to guard access to chunks, probably yes.
        //   Could only copy data if the chunk is not completed yet.
        reusable, ok := it.(*floatHistogramIterator)
        if !ok {
                return newFloatHistogramIterator(c.b.bytes())
        }
        reusable.Reset(c.b.bytes())
        return reusable
}

// newFloatHistogramIterator returns an iterator over the chunk data in b,
// positioned after the 3 header bytes. b must contain at least those 3 bytes.
func newFloatHistogramIterator(b []byte) *floatHistogramIterator {
        it := new(floatHistogramIterator)
        it.br = newBReader(b)
        it.numTotal = binary.BigEndian.Uint16(b)
        it.t = math.MinInt64

        // The first 3 bytes contain chunk headers.
        // We skip that for actual samples.
        _, _ = it.br.readBits(24)

        flags := b[2]
        it.counterResetHeader = CounterResetHeader(flags & CounterResetHeaderMask)
        return it
}

// Iterator implements the Chunk interface.
// If it is a *floatHistogramIterator, it is reset onto this chunk's bytes and
// returned; otherwise a new iterator is allocated.
func (c *FloatHistogramChunk) Iterator(it Iterator) Iterator {
        return c.iterator(it)
}

// FloatHistogramAppender is an Appender implementation for float histograms.
type FloatHistogramAppender struct {
        // b is the bit stream of the chunk being appended to.
        b *bstream

        // Layout:
        schema         int32
        zThreshold     float64
        pSpans, nSpans []histogram.Span
        customValues   []float64

        // Encoder state. FloatHistogramChunk.Appender fills these from an
        // iterator that has been advanced over all samples already in the
        // chunk.
        t, tDelta          int64
        sum, cnt, zCnt     xorValue
        pBuckets, nBuckets []xorValue
}

// GetCounterResetHeader returns the counter reset header currently stored in
// the chunk, i.e. the third chunk byte masked with CounterResetHeaderMask.
func (a *FloatHistogramAppender) GetCounterResetHeader() CounterResetHeader {
        headerByte := a.b.bytes()[2]
        return CounterResetHeader(headerByte & CounterResetHeaderMask)
}

// setCounterResetHeader stores cr in the masked bits of the third chunk byte
// while leaving all bits outside of CounterResetHeaderMask untouched.
func (a *FloatHistogramAppender) setCounterResetHeader(cr CounterResetHeader) {
        buf := a.b.bytes()
        preserved := buf[2] & (^CounterResetHeaderMask)
        buf[2] = preserved | (byte(cr) & CounterResetHeaderMask)
}

// NumSamples returns the number of samples in the chunk as recorded in the
// big-endian uint16 at the start of the chunk bytes.
func (a *FloatHistogramAppender) NumSamples() int {
        count := binary.BigEndian.Uint16(a.b.bytes())
        return int(count)
}

// Append implements Appender. This implementation always panics because normal
// float samples must never be appended to a histogram chunk. Both arguments
// (timestamp and value) are ignored.
func (a *FloatHistogramAppender) Append(int64, float64) {
        panic("appended a float sample to a histogram chunk")
}

// appendable reports whether the counter-like histogram h can be appended to
// the chunk and, if it can, which inserts are needed to recode the chunk so
// that it accommodates new buckets in the positive and negative range,
// respectively. For gauge histograms, appendableGauge must be used instead.
//
// The chunk is not appendable if any of the following holds:
//   - The chunk is non-empty and holds gauge histograms.
//   - h carries an explicit counter reset hint.
//   - The last sample in the chunk was stale while h is not stale.
//   - The schema or the zero bucket threshold has changed.
//   - The custom bounds have changed while the schema is custom buckets.
//   - Any buckets have disappeared.
//   - There was a counter reset in the count of observations or in any
//     bucket, including the zero bucket.
//
// counterReset is set to true if the chunk is not appendable because of a
// counter reset; in that case okToAppend is always false. A stale h is always
// ok to append unless one of the first two conditions above applies.
func (a *FloatHistogramAppender) appendable(h *histogram.FloatHistogram) (
        positiveInserts, negativeInserts []Insert,
        okToAppend, counterReset bool,
) {
        // The cases are evaluated strictly top to bottom; the first match wins.
        switch {
        case a.NumSamples() > 0 && a.GetCounterResetHeader() == GaugeType:
                // Gauge chunks never take counter-like histograms.
                return nil, nil, false, false
        case h.CounterResetHint == histogram.CounterReset:
                // The explicit counter reset hint is always honored.
                return nil, nil, false, true
        case value.IsStaleNaN(h.Sum):
                // Buckets and spans of a stale sample don't matter.
                return nil, nil, true, false
        case value.IsStaleNaN(a.sum.value):
                // After a stale sample, only further stale samples are accepted.
                return nil, nil, false, false
        case h.Count < a.cnt.value:
                // The overall count went down: counter reset.
                return nil, nil, false, true
        case h.Schema != a.schema || h.ZeroThreshold != a.zThreshold:
                return nil, nil, false, false
        case histogram.IsCustomBucketsSchema(h.Schema) && !histogram.FloatBucketsMatch(h.CustomValues, a.customValues):
                return nil, nil, false, true
        case h.ZeroCount < a.zCnt.value:
                // The zero threshold is unchanged, so a lower zero count is a
                // counter reset.
                return nil, nil, false, true
        }

        pIns, ok := expandSpansForward(a.pSpans, h.PositiveSpans)
        if !ok {
                return pIns, nil, false, true
        }
        nIns, ok := expandSpansForward(a.nSpans, h.NegativeSpans)
        if !ok {
                return pIns, nIns, false, true
        }

        if counterResetInAnyFloatBucket(a.pBuckets, h.PositiveBuckets, a.pSpans, h.PositiveSpans) ||
                counterResetInAnyFloatBucket(a.nBuckets, h.NegativeBuckets, a.nSpans, h.NegativeSpans) {
                // The inserts are of no use once a reset has been detected.
                return nil, nil, false, true
        }

        return pIns, nIns, true, false
}

// appendableGauge reports whether the gauge histogram h can be appended to the
// chunk. If so, it additionally returns:
//  1. The forward inserts needed to recode the chunk (in case h has buckets
//     the chunk does not have yet, in the positive or negative range).
//  2. The backward inserts needed to recode h itself (in case h is missing
//     buckets the chunk has, in the positive or negative range).
//  3. The spans resulting from expanding the spans of chunk and h both ways.
//
// This method must only be used for gauge histograms.
//
// The chunk is not appendable if any of the following holds:
//   - The chunk is non-empty and does not hold gauge histograms.
//   - The last sample in the chunk was stale while h is not stale.
//   - The schema or the zero bucket threshold has changed.
//   - The custom bounds have changed while the schema is custom buckets.
func (a *FloatHistogramAppender) appendableGauge(h *histogram.FloatHistogram) (
        positiveInserts, negativeInserts []Insert,
        backwardPositiveInserts, backwardNegativeInserts []Insert,
        positiveSpans, negativeSpans []histogram.Span,
        okToAppend bool,
) {
        // The cases are evaluated strictly top to bottom; the first match wins.
        // Cases with an empty body leave every result at its zero value.
        switch {
        case a.NumSamples() > 0 && a.GetCounterResetHeader() != GaugeType:
                // Non-gauge chunks never take gauge histograms.
        case value.IsStaleNaN(h.Sum):
                // Buckets and spans of a stale sample don't matter.
                okToAppend = true
        case value.IsStaleNaN(a.sum.value):
                // After a stale sample, only further stale samples are accepted.
        case h.Schema != a.schema || h.ZeroThreshold != a.zThreshold:
        case histogram.IsCustomBucketsSchema(h.Schema) && !histogram.FloatBucketsMatch(h.CustomValues, a.customValues):
        default:
                positiveInserts, backwardPositiveInserts, positiveSpans = expandSpansBothWays(a.pSpans, h.PositiveSpans)
                negativeInserts, backwardNegativeInserts, negativeSpans = expandSpansBothWays(a.nSpans, h.NegativeSpans)
                okToAppend = true
        }
        return positiveInserts, negativeInserts,
                backwardPositiveInserts, backwardNegativeInserts,
                positiveSpans, negativeSpans,
                okToAppend
}

// counterResetInAnyFloatBucket returns true if there was a counter reset for any
// bucket. This should be called only when the bucket layout is the same or new
// buckets were added. It does not handle the case of buckets missing.
//
// oldBuckets/oldSpans describe the buckets currently tracked by the appender,
// newBuckets/newSpans those of the histogram about to be appended. The
// function walks both bucket sequences in lockstep and reports a reset as soon
// as a bucket present in both has a new value smaller than its old value. It
// returns false right away if there are no old spans or no old buckets.
//
// The function panics if the new spans are exhausted before the old ones,
// which violates the precondition stated above.
func counterResetInAnyFloatBucket(oldBuckets []xorValue, newBuckets []float64, oldSpans, newSpans []histogram.Span) bool {
        if len(oldSpans) == 0 || len(oldBuckets) == 0 {
                return false
        }

        var (
                oldSpanSliceIdx, newSpanSliceIdx     int    = -1, -1 // Index for the span slices. Starts at -1 to indicate that the first non empty span is not yet found.
                oldInsideSpanIdx, newInsideSpanIdx   uint32          // Index inside a span.
                oldIdx, newIdx                       int32           // Bucket index compared between old and new. NOTE(review): presumably the absolute bucket index as maintained by nextNonEmptySpanSliceIdx — confirm.
                oldBucketSliceIdx, newBucketSliceIdx int             // Index inside bucket slice.
        )

        // Find first non empty spans.
        oldSpanSliceIdx, oldIdx = nextNonEmptySpanSliceIdx(oldSpanSliceIdx, oldIdx, oldSpans)
        newSpanSliceIdx, newIdx = nextNonEmptySpanSliceIdx(newSpanSliceIdx, newIdx, newSpans)
        // Start with the first bucket on both sides. Note that newBuckets is
        // indexed without a length check, relying on the precondition that no
        // buckets are missing in the new histogram.
        oldVal, newVal := oldBuckets[0].value, newBuckets[0]

        // Since we assume that new spans won't have missing buckets, there will never be a case
        // where the old index will not find a matching new index.
        for {
                // Only buckets that exist on both sides can be compared.
                if oldIdx == newIdx {
                        if newVal < oldVal {
                                return true
                        }
                }

                // The old side advances when it is at or behind the new side, i.e.
                // also right after a comparison of matching buckets.
                if oldIdx <= newIdx {
                        // Moving ahead old bucket and span by 1 index.
                        if oldInsideSpanIdx+1 >= oldSpans[oldSpanSliceIdx].Length {
                                // Current span is over.
                                oldSpanSliceIdx, oldIdx = nextNonEmptySpanSliceIdx(oldSpanSliceIdx, oldIdx, oldSpans)
                                oldInsideSpanIdx = 0
                                if oldSpanSliceIdx >= len(oldSpans) {
                                        // All old spans are over.
                                        break
                                }
                        } else {
                                oldInsideSpanIdx++
                                oldIdx++
                        }
                        oldBucketSliceIdx++
                        oldVal = oldBuckets[oldBucketSliceIdx].value
                }

                // This is checked after the old side may have advanced above, so the
                // new side catches up within the same loop iteration if needed.
                if oldIdx > newIdx {
                        // Moving ahead new bucket and span by 1 index.
                        if newInsideSpanIdx+1 >= newSpans[newSpanSliceIdx].Length {
                                // Current span is over.
                                newSpanSliceIdx, newIdx = nextNonEmptySpanSliceIdx(newSpanSliceIdx, newIdx, newSpans)
                                newInsideSpanIdx = 0
                                if newSpanSliceIdx >= len(newSpans) {
                                        // All new spans are over.
                                        // This should not happen, old spans above should catch this first.
                                        panic("new spans over before old spans in counterReset")
                                }
                        } else {
                                newInsideSpanIdx++
                                newIdx++
                        }
                        newBucketSliceIdx++
                        newVal = newBuckets[newBucketSliceIdx]
                }
        }

        // All old buckets were visited without finding a decrease.
        return false
}

// appendFloatHistogram appends a float histogram to the chunk. The caller must ensure that
// the histogram is properly structured, e.g. the number of buckets used
// corresponds to the number conveyed by the span structures. First call
// appendable() or appendableGauge() and act accordingly!
//
// The first sample of a chunk writes the chunk layout followed by the raw
// timestamp and raw 64-bit float values. Every later sample writes the
// delta-of-delta of the timestamp and XOR-encodes all values against the
// previously appended ones. Finally, the sample count in the chunk header is
// incremented and the timestamp state of the appender is updated.
//
// A stale sample (h.Sum is a stale NaN) is written with all fields other than
// the sum emptied out.
func (a *FloatHistogramAppender) appendFloatHistogram(t int64, h *histogram.FloatHistogram) {
        var tDelta int64
        // Number of samples already in the chunk, from the first two header bytes.
        num := binary.BigEndian.Uint16(a.b.bytes())

        if value.IsStaleNaN(h.Sum) {
                // Emptying out other fields to write no buckets, and an empty
                // layout in case of first histogram in the chunk.
                // A new histogram is used so that the caller's h is not modified.
                h = &histogram.FloatHistogram{Sum: h.Sum}
        }

        if num == 0 {
                // The first append gets the privilege to dictate the layout
                // but it's also responsible for encoding it into the chunk!
                writeHistogramChunkLayout(a.b, h.Schema, h.ZeroThreshold, h.PositiveSpans, h.NegativeSpans, h.CustomValues)
                a.schema = h.Schema
                a.zThreshold = h.ZeroThreshold

                // Spans and custom values are copied so that the appender does not
                // share the slices with the caller's histogram.
                if len(h.PositiveSpans) > 0 {
                        a.pSpans = make([]histogram.Span, len(h.PositiveSpans))
                        copy(a.pSpans, h.PositiveSpans)
                } else {
                        a.pSpans = nil
                }
                if len(h.NegativeSpans) > 0 {
                        a.nSpans = make([]histogram.Span, len(h.NegativeSpans))
                        copy(a.nSpans, h.NegativeSpans)
                } else {
                        a.nSpans = nil
                }
                if len(h.CustomValues) > 0 {
                        a.customValues = make([]float64, len(h.CustomValues))
                        copy(a.customValues, h.CustomValues)
                } else {
                        a.customValues = nil
                }

                // Initialize the per-bucket XOR state with the first values. The
                // leading field is set to 0xff for every bucket.
                // NOTE(review): 0xff presumably marks "no leading/trailing window
                // established yet" for xorWrite — confirm against xorWrite.
                numPBuckets, numNBuckets := countSpans(h.PositiveSpans), countSpans(h.NegativeSpans)
                if numPBuckets > 0 {
                        a.pBuckets = make([]xorValue, numPBuckets)
                        for i := 0; i < numPBuckets; i++ {
                                a.pBuckets[i] = xorValue{
                                        value:   h.PositiveBuckets[i],
                                        leading: 0xff,
                                }
                        }
                } else {
                        a.pBuckets = nil
                }
                if numNBuckets > 0 {
                        a.nBuckets = make([]xorValue, numNBuckets)
                        for i := 0; i < numNBuckets; i++ {
                                a.nBuckets[i] = xorValue{
                                        value:   h.NegativeBuckets[i],
                                        leading: 0xff,
                                }
                        }
                } else {
                        a.nBuckets = nil
                }

                // Now store the actual data.
                // Order: timestamp, count, zero count, sum, positive buckets,
                // negative buckets. All float values are written as raw 64 bits.
                putVarbitInt(a.b, t)
                a.b.writeBits(math.Float64bits(h.Count), 64)
                a.b.writeBits(math.Float64bits(h.ZeroCount), 64)
                a.b.writeBits(math.Float64bits(h.Sum), 64)
                a.cnt.value = h.Count
                a.zCnt.value = h.ZeroCount
                a.sum.value = h.Sum
                for _, b := range h.PositiveBuckets {
                        a.b.writeBits(math.Float64bits(b), 64)
                }
                for _, b := range h.NegativeBuckets {
                        a.b.writeBits(math.Float64bits(b), 64)
                }
        } else {
                // The case for the 2nd sample with single deltas is implicitly handled correctly with the double delta code,
                // so we don't need a separate single delta logic for the 2nd sample.
                tDelta = t - a.t
                tDod := tDelta - a.tDelta
                putVarbitInt(a.b, tDod)

                // Same field order as for the first sample, but XOR-encoded
                // against the previously appended values.
                a.writeXorValue(&a.cnt, h.Count)
                a.writeXorValue(&a.zCnt, h.ZeroCount)
                a.writeXorValue(&a.sum, h.Sum)

                // For a stale sample, h has no buckets, so nothing is written here.
                for i, b := range h.PositiveBuckets {
                        a.writeXorValue(&a.pBuckets[i], b)
                }
                for i, b := range h.NegativeBuckets {
                        a.writeXorValue(&a.nBuckets[i], b)
                }
        }

        // Bump the sample count stored in the chunk header.
        binary.BigEndian.PutUint16(a.b.bytes(), num+1)

        a.t = t
        a.tDelta = tDelta
}

// writeXorValue XOR-encodes v against the value remembered in old, writes the
// result to the chunk's bit stream, and then records v in old as the new
// reference value. The leading/trailing state in old is passed to xorWrite by
// pointer so that xorWrite can update it.
func (a *FloatHistogramAppender) writeXorValue(old *xorValue, v float64) {
        previous, leading, trailing := old.value, &old.leading, &old.trailing
        xorWrite(a.b, v, previous, leading, trailing)
        old.value = v
}

// recode re-encodes the current chunk into a new chunk whose bucket layout
// follows the provided positive and negative spans. The provided inserts
// describe where buckets have to be added to every existing sample to arrive
// at that layout. The counter reset header of the current chunk is carried
// over to the new one. To continue appending, use the returned Appender
// rather than the receiver of this method.
func (a *FloatHistogramAppender) recode(
        positiveInserts, negativeInserts []Insert,
        positiveSpans, negativeSpans []histogram.Span,
) (Chunk, Appender) {
        // TODO(beorn7): This currently just decodes everything and then encodes
        // it again with the new span layout. This can probably be done in-place
        // by editing the chunk. But let's first see how expensive it is in the
        // big picture. Also, in-place editing might create concurrency issues.
        oldBytes := a.b.bytes()
        oldIt := newFloatHistogramIterator(oldBytes)

        newChunk := NewFloatHistogramChunk()
        app, err := newChunk.Appender()
        if err != nil {
                panic(err) // This should never happen for an empty float histogram chunk.
        }
        newApp := app.(*FloatHistogramAppender)

        pCount, nCount := countSpans(positiveSpans), countSpans(negativeSpans)
        needPositive, needNegative := len(positiveInserts) > 0, len(negativeInserts) > 0

        for oldIt.Next() == ValFloatHistogram {
                ts, fh := oldIt.AtFloatHistogram(nil)

                // The target slices for the expanded buckets have to be newly
                // allocated for every sample because they are kept by the
                // appender until the next append.
                // TODO(beorn7): We might be able to optimize this.
                var pTarget, nTarget []float64
                if pCount > 0 {
                        pTarget = make([]float64, pCount)
                }
                if nCount > 0 {
                        nTarget = make([]float64, nCount)
                }

                // Switch the decoded histogram over to the new layout and save
                // it to the new chunk.
                fh.PositiveSpans = positiveSpans
                fh.NegativeSpans = negativeSpans
                if needPositive {
                        fh.PositiveBuckets = insert(fh.PositiveBuckets, pTarget, positiveInserts, false)
                }
                if needNegative {
                        fh.NegativeBuckets = insert(fh.NegativeBuckets, nTarget, negativeInserts, false)
                }
                newApp.appendFloatHistogram(ts, fh)
        }

        oldHeader := CounterResetHeader(oldBytes[2] & CounterResetHeaderMask)
        newApp.setCounterResetHeader(oldHeader)
        return newChunk, app
}

// recodeHistogram expands the buckets of fh in place according to the given
// backward inserts, for the positive and negative range respectively. The
// size of each expanded bucket slice is derived from the spans already set on
// fh, so the caller has to update the spans of fh before calling this method.
// A range without inserts is left untouched.
func (a *FloatHistogramAppender) recodeHistogram(
        fh *histogram.FloatHistogram,
        pBackwardInter, nBackwardInter []Insert,
) {
        if len(pBackwardInter) != 0 {
                expanded := make([]float64, countSpans(fh.PositiveSpans))
                fh.PositiveBuckets = insert(fh.PositiveBuckets, expanded, pBackwardInter, false)
        }
        if len(nBackwardInter) != 0 {
                expanded := make([]float64, countSpans(fh.NegativeSpans))
                fh.NegativeBuckets = insert(fh.NegativeBuckets, expanded, nBackwardInter, false)
        }
}

// AppendHistogram always panics because integer histogram samples must never
// be appended to a float histogram chunk. All arguments are ignored.
func (a *FloatHistogramAppender) AppendHistogram(*HistogramAppender, int64, *histogram.Histogram, bool) (Chunk, bool, Appender, error) {
        panic("appended a histogram sample to a float histogram chunk")
}

// AppendFloatHistogram appends the float histogram h with timestamp t, taking
// care of everything needed to keep the chunk consistent: setting the counter
// reset header, cutting a new chunk, or recoding the current chunk.
//
// prev is the appender of the preceding chunk, if any. It is only consulted
// when the current chunk is still empty, to decide on the counter reset
// header. If appendOnly is true, the method never cuts a new chunk or recodes
// but returns an error instead in those situations.
//
// The returned values are:
//   - Chunk: the new chunk if one was created (by cutting or recoding), nil
//     otherwise.
//   - bool: true if the returned chunk is a recoded version of the current
//     chunk.
//   - Appender: the appender to use for further appends.
//   - error: non-nil only if appendOnly is true and the sample could not be
//     appended to the current chunk as is.
//
// Note that for gauge histograms with backward inserts, h is modified in place
// (its spans and buckets are expanded).
func (a *FloatHistogramAppender) AppendFloatHistogram(prev *FloatHistogramAppender, t int64, h *histogram.FloatHistogram, appendOnly bool) (Chunk, bool, Appender, error) {
        // First sample of the chunk: it can always be appended; only the counter
        // reset header needs to be decided.
        if a.NumSamples() == 0 {
                a.appendFloatHistogram(t, h)
                if h.CounterResetHint == histogram.GaugeType {
                        a.setCounterResetHeader(GaugeType)
                        return nil, false, a, nil
                }

                // If neither case matches (no explicit hint and no previous
                // chunk), the header is left as it is.
                switch {
                case h.CounterResetHint == histogram.CounterReset:
                        // Always honor the explicit counter reset hint.
                        a.setCounterResetHeader(CounterReset)
                case prev != nil:
                        // This is a new chunk, but continued from a previous one. We need to calculate the reset header unless already set.
                        _, _, _, counterReset := prev.appendable(h)
                        if counterReset {
                                a.setCounterResetHeader(CounterReset)
                        } else {
                                a.setCounterResetHeader(NotCounterReset)
                        }
                }
                return nil, false, a, nil
        }

        // Adding counter-like histogram.
        if h.CounterResetHint != histogram.GaugeType {
                pForwardInserts, nForwardInserts, okToAppend, counterReset := a.appendable(h)
                if !okToAppend || counterReset {
                        if appendOnly {
                                if counterReset {
                                        return nil, false, a, fmt.Errorf("float histogram counter reset")
                                }
                                return nil, false, a, fmt.Errorf("float histogram schema change")
                        }
                        // Cut a new chunk starting with h.
                        newChunk := NewFloatHistogramChunk()
                        app, err := newChunk.Appender()
                        if err != nil {
                                panic(err) // This should never happen for an empty float histogram chunk.
                        }
                        happ := app.(*FloatHistogramAppender)
                        // Without a detected counter reset, the header of the new
                        // chunk is not set explicitly.
                        if counterReset {
                                happ.setCounterResetHeader(CounterReset)
                        }
                        happ.appendFloatHistogram(t, h)
                        return newChunk, false, app, nil
                }
                // h has buckets the chunk doesn't know yet: recode the chunk to the
                // layout of h, then append h to the recoded chunk.
                if len(pForwardInserts) > 0 || len(nForwardInserts) > 0 {
                        if appendOnly {
                                return nil, false, a, fmt.Errorf("float histogram layout change with %d positive and %d negative forwards inserts", len(pForwardInserts), len(nForwardInserts))
                        }
                        chk, app := a.recode(
                                pForwardInserts, nForwardInserts,
                                h.PositiveSpans, h.NegativeSpans,
                        )
                        app.(*FloatHistogramAppender).appendFloatHistogram(t, h)
                        return chk, true, app, nil
                }
                // Same layout, no reset: plain append.
                a.appendFloatHistogram(t, h)
                return nil, false, a, nil
        }
        // Adding gauge histogram.
        pForwardInserts, nForwardInserts, pBackwardInserts, nBackwardInserts, pMergedSpans, nMergedSpans, okToAppend := a.appendableGauge(h)
        if !okToAppend {
                if appendOnly {
                        return nil, false, a, fmt.Errorf("float gauge histogram schema change")
                }
                // Cut a new gauge chunk starting with h.
                newChunk := NewFloatHistogramChunk()
                app, err := newChunk.Appender()
                if err != nil {
                        panic(err) // This should never happen for an empty float histogram chunk.
                }
                happ := app.(*FloatHistogramAppender)
                happ.setCounterResetHeader(GaugeType)
                happ.appendFloatHistogram(t, h)
                return newChunk, false, app, nil
        }

        // The chunk has buckets that h lacks: expand h itself (in place) to the
        // merged layout. This has to happen before a possible recode of the
        // chunk below, which uses the updated spans of h.
        if len(pBackwardInserts)+len(nBackwardInserts) > 0 {
                if appendOnly {
                        return nil, false, a, fmt.Errorf("float gauge histogram layout change with %d positive and %d negative backwards inserts", len(pBackwardInserts), len(nBackwardInserts))
                }
                h.PositiveSpans = pMergedSpans
                h.NegativeSpans = nMergedSpans
                a.recodeHistogram(h, pBackwardInserts, nBackwardInserts)
        }

        // h has buckets that the chunk lacks: recode the chunk, then append h to
        // the recoded chunk.
        if len(pForwardInserts) > 0 || len(nForwardInserts) > 0 {
                if appendOnly {
                        return nil, false, a, fmt.Errorf("float gauge histogram layout change with %d positive and %d negative forwards inserts", len(pForwardInserts), len(nForwardInserts))
                }
                chk, app := a.recode(
                        pForwardInserts, nForwardInserts,
                        h.PositiveSpans, h.NegativeSpans,
                )
                app.(*FloatHistogramAppender).appendFloatHistogram(t, h)
                return chk, true, app, nil
        }

        a.appendFloatHistogram(t, h)
        return nil, false, a, nil
}

// floatHistogramIterator decodes the samples of a float histogram chunk one by
// one. It keeps the chunk layout and the most recently decoded sample, which
// is needed as the reference for decoding the next one.
type floatHistogramIterator struct {
        // br reads the chunk bit by bit. numTotal is the number of samples
        // according to the chunk header, numRead the number of samples decoded
        // so far.
        br       bstreamReader
        numTotal uint16
        numRead  uint16

        // counterResetHeader is taken from the masked third byte of the chunk.
        counterResetHeader CounterResetHeader

        // Layout:
        // These fields are populated while decoding the first sample.
        schema         int32
        zThreshold     float64
        pSpans, nSpans []histogram.Span
        customValues   []float64

        // For the fields that are tracked as deltas and ultimately dod's.
        t      int64
        tDelta int64

        // All Gorilla xor encoded.
        sum, cnt, zCnt xorValue

        // Buckets are not of type xorValue to avoid creating
        // new slices for every AtFloatHistogram call.
        // The leading/trailing slices run parallel to the bucket slices and
        // hold the XOR decoding state per bucket.
        pBuckets, nBuckets                 []float64
        pBucketsLeading, nBucketsLeading   []uint8
        pBucketsTrailing, nBucketsTrailing []uint8

        // err is the first error encountered while decoding. Once set, Next and
        // Seek return ValNone.
        err error

        // Track calls to retrieve methods. Once they have been called, we
        // cannot recycle the bucket slices anymore because we have returned
        // them in the histogram.
        atFloatHistogramCalled bool
}

// Seek advances the iterator until the current sample has a timestamp equal
// to or greater than t. If no sample has been read yet, at least one sample is
// read. It returns ValFloatHistogram on success and ValNone if the iterator
// is in an error state or runs out of samples first. The iterator never moves
// backwards.
func (it *floatHistogramIterator) Seek(t int64) ValueType {
        if it.err != nil {
                return ValNone
        }

        for {
                if it.numRead != 0 && it.t >= t {
                        return ValFloatHistogram
                }
                if it.Next() == ValNone {
                        return ValNone
                }
        }
}

// At always panics because a float histogram chunk holds no plain float
// samples.
func (it *floatHistogramIterator) At() (int64, float64) {
        panic("cannot call floatHistogramIterator.At")
}

// AtHistogram always panics because this iterator only yields float
// histograms. Use AtFloatHistogram instead.
func (it *floatHistogramIterator) AtHistogram(*histogram.Histogram) (int64, *histogram.Histogram) {
        panic("cannot call floatHistogramIterator.AtHistogram")
}

// AtFloatHistogram returns the timestamp and the float histogram of the
// current sample.
//
// For a stale sample, a new histogram with only the sum set is returned,
// regardless of fh. Otherwise, if fh is nil, a new histogram is returned that
// directly references the iterator's span, bucket, and custom value slices.
// If fh is not nil, it is filled with copies of the current data (reusing its
// slices via resize) and returned.
func (it *floatHistogramIterator) AtFloatHistogram(fh *histogram.FloatHistogram) (int64, *histogram.FloatHistogram) {
        switch {
        case value.IsStaleNaN(it.sum.value):
                return it.t, &histogram.FloatHistogram{Sum: it.sum.value}
        case fh == nil:
                // The bucket slices are handed out below, so they must not be
                // recycled by the iterator anymore.
                it.atFloatHistogramCalled = true
                fresh := &histogram.FloatHistogram{
                        CounterResetHint: counterResetHint(it.counterResetHeader, it.numRead),
                        Count:            it.cnt.value,
                        ZeroCount:        it.zCnt.value,
                        Sum:              it.sum.value,
                        ZeroThreshold:    it.zThreshold,
                        Schema:           it.schema,
                        PositiveSpans:    it.pSpans,
                        NegativeSpans:    it.nSpans,
                        PositiveBuckets:  it.pBuckets,
                        NegativeBuckets:  it.nBuckets,
                        CustomValues:     it.customValues,
                }
                return it.t, fresh
        }

        // Scalar fields.
        fh.CounterResetHint = counterResetHint(it.counterResetHeader, it.numRead)
        fh.Schema, fh.ZeroThreshold = it.schema, it.zThreshold
        fh.ZeroCount, fh.Count, fh.Sum = it.zCnt.value, it.cnt.value, it.sum.value

        // Slice fields are copied into the (possibly recycled) slices of fh so
        // that fh does not share memory with the iterator.
        fh.PositiveSpans = resize(fh.PositiveSpans, len(it.pSpans))
        fh.NegativeSpans = resize(fh.NegativeSpans, len(it.nSpans))
        fh.PositiveBuckets = resize(fh.PositiveBuckets, len(it.pBuckets))
        fh.NegativeBuckets = resize(fh.NegativeBuckets, len(it.nBuckets))
        fh.CustomValues = resize(fh.CustomValues, len(it.customValues))

        copy(fh.PositiveSpans, it.pSpans)
        copy(fh.NegativeSpans, it.nSpans)
        copy(fh.PositiveBuckets, it.pBuckets)
        copy(fh.NegativeBuckets, it.nBuckets)
        copy(fh.CustomValues, it.customValues)

        return it.t, fh
}

// AtT returns the timestamp of the current sample as tracked by the iterator.
func (it *floatHistogramIterator) AtT() int64 {
        return it.t
}

// Err returns the error recorded while decoding, or nil if there was none.
func (it *floatHistogramIterator) Err() error {
        return it.err
}

// Reset re-initializes the iterator to decode the chunk bytes b from the
// start, recycling previously allocated slices where that is safe. Bucket
// slices that have been handed out via AtFloatHistogram are dropped rather
// than recycled. The slices tracking leading and trailing bits are never
// returned to callers and are therefore always recycled.
func (it *floatHistogramIterator) Reset(b []byte) {
        // The first 3 bytes contain chunk headers.
        // We skip that for actual samples.
        it.br = newBReader(b[3:])
        it.numTotal, it.numRead = binary.BigEndian.Uint16(b), 0
        it.counterResetHeader = CounterResetHeader(b[2] & CounterResetHeaderMask)

        it.t = 0
        it.tDelta = 0
        it.cnt = xorValue{}
        it.zCnt = xorValue{}
        it.sum = xorValue{}

        if !it.atFloatHistogramCalled {
                it.pBuckets = it.pBuckets[:0]
                it.nBuckets = it.nBuckets[:0]
        } else {
                it.atFloatHistogramCalled = false
                it.pBuckets = nil
                it.nBuckets = nil
        }

        it.pBucketsLeading = it.pBucketsLeading[:0]
        it.pBucketsTrailing = it.pBucketsTrailing[:0]
        it.nBucketsLeading = it.nBucketsLeading[:0]
        it.nBucketsTrailing = it.nBucketsTrailing[:0]

        it.err = nil
}

// Next advances the iterator to the next sample and decodes it. It returns
// ValFloatHistogram if a sample was decoded and ValNone if the iterator is in
// an error state, all samples have been read, or decoding failed (in which
// case the error is available via Err).
//
// The first sample is read together with the chunk layout and consists of raw
// values. All following samples are decoded from the delta-of-delta of the
// timestamp and XOR-encoded values relative to the previous sample.
func (it *floatHistogramIterator) Next() ValueType {
        if it.err != nil || it.numRead == it.numTotal {
                return ValNone
        }

        if it.numRead == 0 {
                // The first read is responsible for reading the chunk layout
                // and for initializing fields that depend on it. We give
                // counter reset info at chunk level, hence we discard it here.
                schema, zeroThreshold, posSpans, negSpans, customValues, err := readHistogramChunkLayout(&it.br)
                if err != nil {
                        it.err = err
                        return ValNone
                }
                it.schema = schema
                it.zThreshold = zeroThreshold
                it.pSpans, it.nSpans = posSpans, negSpans
                it.customValues = customValues
                numPBuckets, numNBuckets := countSpans(posSpans), countSpans(negSpans)
                // Allocate bucket slices as needed, recycling existing slices
                // in case this iterator was reset and already has slices of a
                // sufficient capacity.
                if numPBuckets > 0 {
                        it.pBuckets = append(it.pBuckets, make([]float64, numPBuckets)...)
                        it.pBucketsLeading = append(it.pBucketsLeading, make([]uint8, numPBuckets)...)
                        it.pBucketsTrailing = append(it.pBucketsTrailing, make([]uint8, numPBuckets)...)
                }
                if numNBuckets > 0 {
                        it.nBuckets = append(it.nBuckets, make([]float64, numNBuckets)...)
                        it.nBucketsLeading = append(it.nBucketsLeading, make([]uint8, numNBuckets)...)
                        it.nBucketsTrailing = append(it.nBucketsTrailing, make([]uint8, numNBuckets)...)
                }

                // Now read the actual data.
                // Order: timestamp, count, zero count, sum, positive buckets,
                // negative buckets. All float values are stored as raw 64 bits.
                t, err := readVarbitInt(&it.br)
                if err != nil {
                        it.err = err
                        return ValNone
                }
                it.t = t

                cnt, err := it.br.readBits(64)
                if err != nil {
                        it.err = err
                        return ValNone
                }
                it.cnt.value = math.Float64frombits(cnt)

                zcnt, err := it.br.readBits(64)
                if err != nil {
                        it.err = err
                        return ValNone
                }
                it.zCnt.value = math.Float64frombits(zcnt)

                sum, err := it.br.readBits(64)
                if err != nil {
                        it.err = err
                        return ValNone
                }
                it.sum.value = math.Float64frombits(sum)

                for i := range it.pBuckets {
                        v, err := it.br.readBits(64)
                        if err != nil {
                                it.err = err
                                return ValNone
                        }
                        it.pBuckets[i] = math.Float64frombits(v)
                }
                for i := range it.nBuckets {
                        v, err := it.br.readBits(64)
                        if err != nil {
                                it.err = err
                                return ValNone
                        }
                        it.nBuckets[i] = math.Float64frombits(v)
                }

                it.numRead++
                return ValFloatHistogram
        }

        // The case for the 2nd sample with single deltas is implicitly handled correctly with the double delta code,
        // so we don't need a separate single delta logic for the 2nd sample.

        // Recycle bucket slices that have not been returned yet. Otherwise, copy them.
        // We can always recycle the slices for leading and trailing bits as they are
        // never returned to the caller.
        // The copies keep the previous values, which are needed as the
        // reference for XOR decoding below.
        if it.atFloatHistogramCalled {
                it.atFloatHistogramCalled = false
                if len(it.pBuckets) > 0 {
                        newBuckets := make([]float64, len(it.pBuckets))
                        copy(newBuckets, it.pBuckets)
                        it.pBuckets = newBuckets
                } else {
                        it.pBuckets = nil
                }
                if len(it.nBuckets) > 0 {
                        newBuckets := make([]float64, len(it.nBuckets))
                        copy(newBuckets, it.nBuckets)
                        it.nBuckets = newBuckets
                } else {
                        it.nBuckets = nil
                }
        }

        // Timestamp: the stored value is the delta-of-delta.
        tDod, err := readVarbitInt(&it.br)
        if err != nil {
                it.err = err
                return ValNone
        }
        it.tDelta += tDod
        it.t += it.tDelta

        // readXor records any error in it.err before returning false.
        if ok := it.readXor(&it.cnt.value, &it.cnt.leading, &it.cnt.trailing); !ok {
                return ValNone
        }

        if ok := it.readXor(&it.zCnt.value, &it.zCnt.leading, &it.zCnt.trailing); !ok {
                return ValNone
        }

        if ok := it.readXor(&it.sum.value, &it.sum.leading, &it.sum.trailing); !ok {
                return ValNone
        }

        // A stale sample is complete at this point; no bucket values are read
        // for it.
        if value.IsStaleNaN(it.sum.value) {
                it.numRead++
                return ValFloatHistogram
        }

        for i := range it.pBuckets {
                if ok := it.readXor(&it.pBuckets[i], &it.pBucketsLeading[i], &it.pBucketsTrailing[i]); !ok {
                        return ValNone
                }
        }

        for i := range it.nBuckets {
                if ok := it.readXor(&it.nBuckets[i], &it.nBucketsLeading[i], &it.nBucketsTrailing[i]); !ok {
                        return ValNone
                }
        }

        it.numRead++
        return ValFloatHistogram
}

// readXor calls xorRead on the iterator's bit reader, handing v, leading and
// trailing through so that xorRead can update them. It reports whether the
// read succeeded; on failure the error is stored in it.err.
func (it *floatHistogramIterator) readXor(v *float64, leading, trailing *uint8) bool {
        if err := xorRead(&it.br, v, leading, trailing); err != nil {
                it.err = err
                return false
        }
        return true
}

// Copyright 2021 The Prometheus Authors
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package chunkenc

import (
        "encoding/binary"
        "fmt"
        "math"

        "github.com/prometheus/prometheus/model/histogram"
        "github.com/prometheus/prometheus/model/value"
)

// HistogramChunk holds encoded sample data for a sparse, high-resolution
// histogram.
//
// The encoded stream starts with a 3-byte header: a big-endian uint16 holding
// the number of samples, followed by one byte whose two most significant bits
// carry the CounterResetHeader.
//
// Each sample has multiple "fields", stored in the following way (raw = store
// number directly, delta = store delta to the previous number, dod = store
// delta of the delta to the previous number, xor = what we do for regular
// sample values):
//
//        field →    ts    count zeroCount sum []posbuckets []negbuckets
//        sample 1   raw   raw   raw       raw []raw        []raw
//        sample 2   delta delta delta     xor []delta      []delta
//        sample >2  dod   dod   dod       xor []dod        []dod
type HistogramChunk struct {
        // b is the bit stream holding the chunk header and the encoded samples.
        b bstream
}

// NewHistogramChunk returns a new, empty chunk with histogram encoding. Its
// stream initially consists of the zeroed 3-byte chunk header only, backed by
// a slice with some spare capacity for the samples to come.
func NewHistogramChunk() *HistogramChunk {
        const (
                headerLen  = 3   // Length of the chunk header in bytes.
                initialCap = 128 // Initial capacity of the backing slice in bytes.
        )
        stream := make([]byte, headerLen, initialCap)
        return &HistogramChunk{b: bstream{stream: stream, count: 0}}
}

// Reset resets the chunk's bit stream with the given byte slice by calling
// Reset on the underlying bstream.
func (c *HistogramChunk) Reset(stream []byte) {
        c.b.Reset(stream)
}

// Encoding returns the encoding type of the chunk, which is always
// EncHistogram.
func (c *HistogramChunk) Encoding() Encoding {
        return EncHistogram
}

// Bytes returns the underlying byte slice of the chunk as provided by its bit
// stream. The slice starts with the chunk header (sample count and counter
// reset header), followed by the encoded samples.
func (c *HistogramChunk) Bytes() []byte {
        return c.b.bytes()
}

// NumSamples returns the number of samples in the chunk. The count is read
// from the first two bytes of the chunk, where it is stored as a big-endian
// uint16.
func (c *HistogramChunk) NumSamples() int {
        return int(binary.BigEndian.Uint16(c.Bytes()))
}

// Layout returns the histogram layout. Only call this on chunks that have at
// least one sample; it panics if the chunk is empty.
//
// The results are whatever readHistogramChunkLayout returns for a reader
// positioned right after the 2-byte sample count.
//
// NOTE(review): the named results list negativeSpans before positiveSpans.
// Confirm that this matches the order in which readHistogramChunkLayout
// returns the spans, as the values are passed through positionally.
func (c *HistogramChunk) Layout() (
        schema int32, zeroThreshold float64,
        negativeSpans, positiveSpans []histogram.Span,
        customValues []float64,
        err error,
) {
        if c.NumSamples() == 0 {
                panic("HistogramChunk.Layout() called on an empty chunk")
        }
        // Skip the 2-byte sample count. The third header byte is left in the
        // reader; presumably readHistogramChunkLayout consumes it itself
        // (TODO confirm).
        b := newBReader(c.Bytes()[2:])
        return readHistogramChunkLayout(&b)
}

// CounterResetHeader defines the first 2 bits of the chunk header, i.e. the
// two most significant bits of the header byte.
type CounterResetHeader byte

// The possible values of a CounterResetHeader, each written as its 2-bit
// pattern shifted into the two most significant bits of a byte.
const (
        // UnknownCounterReset means we cannot say if this chunk was created due to a counter reset or not.
        // An explicit counter reset detection needs to happen during query time.
        UnknownCounterReset CounterResetHeader = 0b00 << 6
        // NotCounterReset means there was definitely no counter reset when cutting this chunk.
        NotCounterReset CounterResetHeader = 0b01 << 6
        // CounterReset means there was definitely a counter reset that resulted in this chunk.
        CounterReset CounterResetHeader = 0b10 << 6
        // GaugeType means this chunk contains a gauge histogram, where counter resets do not happen.
        GaugeType CounterResetHeader = 0b11 << 6
)

// CounterResetHeaderMask is the mask to get the counter reset header bits.
const CounterResetHeaderMask byte = 0b11 << 6

// GetCounterResetHeader returns the info about the first 2 bits of the chunk
// header. It reads the third byte of the chunk and masks it with
// CounterResetHeaderMask, so all other bits of that byte are cleared in the
// result.
func (c *HistogramChunk) GetCounterResetHeader() CounterResetHeader {
        return CounterResetHeader(c.Bytes()[2] & CounterResetHeaderMask)
}

// Compact implements the Chunk interface. If the capacity of the stream
// exceeds its length by more than chunkCompactCapacityThreshold, the stream is
// copied into a freshly allocated slice of exactly its length. Otherwise the
// chunk is left untouched.
func (c *HistogramChunk) Compact() {
        used := len(c.b.stream)
        if cap(c.b.stream) <= used+chunkCompactCapacityThreshold {
                // Not enough slack to be worth a reallocation.
                return
        }
        trimmed := make([]byte, used)
        copy(trimmed, c.b.stream)
        c.b.stream = trimmed
}

// Appender implements the Chunk interface. It returns a HistogramAppender
// whose state is as if all samples already in the chunk had been appended
// through it. An error is returned if iterating the existing samples fails.
func (c *HistogramChunk) Appender() (Appender, error) {
        it := c.iterator(nil)

        // The appender needs the layout, last values and last deltas of the
        // existing data. Replaying the whole chunk leaves exactly that state
        // behind in the iterator.
        for v := it.Next(); v == ValHistogram; v = it.Next() {
        }
        if err := it.Err(); err != nil {
                return nil, err
        }

        // An empty chunk gets 0xff as the leading value instead of the
        // iterator's. NOTE(review): presumably a sentinel understood by
        // xorWrite for "no previous leading/trailing window" — confirm.
        leading := it.leading
        if it.numTotal == 0 {
                leading = 0xff
        }

        return &HistogramAppender{
                b: &c.b,

                // Layout.
                schema:       it.schema,
                zThreshold:   it.zThreshold,
                pSpans:       it.pSpans,
                nSpans:       it.nSpans,
                customValues: it.customValues,

                // Last values and deltas.
                t:             it.t,
                cnt:           it.cnt,
                zCnt:          it.zCnt,
                tDelta:        it.tDelta,
                cntDelta:      it.cntDelta,
                zCntDelta:     it.zCntDelta,
                pBuckets:      it.pBuckets,
                nBuckets:      it.nBuckets,
                pBucketsDelta: it.pBucketsDelta,
                nBucketsDelta: it.nBucketsDelta,

                // Xor state of the sum.
                sum:      it.sum,
                leading:  leading,
                trailing: it.trailing,
        }, nil
}

// countSpans returns the sum of the lengths of all given spans. It returns 0
// for a nil or empty slice.
func countSpans(spans []histogram.Span) int {
        total := 0
        for i := range spans {
                total += int(spans[i].Length)
        }
        return total
}

// newHistogramIterator returns an iterator over the encoded chunk b. The
// total number of samples and the counter reset header are taken from the
// 3-byte chunk header, and the bit reader is positioned right after it.
func newHistogramIterator(b []byte) *histogramIterator {
        it := new(histogramIterator)
        it.br = newBReader(b)
        it.numTotal = binary.BigEndian.Uint16(b)
        it.t = math.MinInt64

        // Sample data starts after the 3 header bytes, so move the reader
        // past them. The result of the read is deliberately ignored.
        _, _ = it.br.readBits(24)

        it.counterResetHeader = CounterResetHeader(b[2] & CounterResetHeaderMask)
        return it
}

// iterator returns a histogramIterator over the chunk's current bytes. If the
// given iterator is a *histogramIterator, it is reset and reused; in all other
// cases (including nil) a new iterator is created.
func (c *HistogramChunk) iterator(it Iterator) *histogramIterator {
        // This comment is copied from XORChunk.iterator:
        //   Should iterators guarantee to act on a copy of the data so it doesn't lock append?
        //   When using striped locks to guard access to chunks, probably yes.
        //   Could only copy data if the chunk is not completed yet.
        reusable, ok := it.(*histogramIterator)
        if !ok {
                return newHistogramIterator(c.b.bytes())
        }
        reusable.Reset(c.b.bytes())
        return reusable
}

// Iterator implements the Chunk interface. It returns an iterator over the
// samples of the chunk. If the passed iterator is a *histogramIterator, it is
// reset and reused rather than allocating a new one.
func (c *HistogramChunk) Iterator(it Iterator) Iterator {
        return c.iterator(it)
}

// HistogramAppender is an Appender implementation for sparse histograms.
//
// Besides the bit stream it writes to, it keeps the layout of the chunk and
// the values and deltas of the most recently appended sample, which are
// needed to encode the next sample.
type HistogramAppender struct {
        // b is the bit stream of the chunk being appended to.
        b *bstream

        // Layout:
        schema         int32
        zThreshold     float64
        pSpans, nSpans []histogram.Span
        customValues   []float64

        // Although we intend to start new chunks on counter resets, we still
        // have to handle negative deltas for gauge histograms. Therefore, even
        // deltas are signed types here (even for tDelta to not treat that one
        // specially).
        t                            int64
        cnt, zCnt                    uint64
        tDelta, cntDelta, zCntDelta  int64
        pBuckets, nBuckets           []int64
        pBucketsDelta, nBucketsDelta []int64

        // The sum is Gorilla xor encoded. leading and trailing are the xor
        // encoding state handed to xorWrite on each append.
        sum      float64
        leading  uint8
        trailing uint8
}

// GetCounterResetHeader returns the counter reset header of the chunk being
// appended to, i.e. the third byte of the stream masked with
// CounterResetHeaderMask.
func (a *HistogramAppender) GetCounterResetHeader() CounterResetHeader {
        return CounterResetHeader(a.b.bytes()[2] & CounterResetHeaderMask)
}

// setCounterResetHeader writes cr into the counter reset bits of the third
// byte of the stream. All bits of that byte outside CounterResetHeaderMask
// keep their current value.
func (a *HistogramAppender) setCounterResetHeader(cr CounterResetHeader) {
        stream := a.b.bytes()
        preserved := stream[2] &^ CounterResetHeaderMask
        stream[2] = preserved | (byte(cr) & CounterResetHeaderMask)
}

// NumSamples returns the number of samples in the chunk being appended to. The
// count is read from the first two bytes of the stream, where it is stored as
// a big-endian uint16.
func (a *HistogramAppender) NumSamples() int {
        return int(binary.BigEndian.Uint16(a.b.bytes()))
}

// Append implements Appender. This implementation panics because normal float
// samples must never be appended to a histogram chunk. Both arguments are
// ignored.
func (a *HistogramAppender) Append(int64, float64) {
        panic("appended a float sample to a histogram chunk")
}

// appendable reports whether the counter histogram h can be appended to the
// chunk and, if so, which inserts are needed to recode the chunk so that it
// also covers buckets that are new in h (for the positive and negative range,
// respectively). For gauge histograms, appendableGauge must be used instead.
//
// okToAppend is false in the following cases:
//
//   - The chunk already has samples and is a gauge chunk.
//   - h carries an explicit counter reset hint.
//   - The last sample in the chunk was stale while h is not stale.
//   - The schema has changed.
//   - The custom bounds have changed if the schema is a custom buckets schema.
//   - The threshold for the zero bucket has changed.
//   - Any buckets have disappeared.
//   - There was a counter reset in the count of observations or in any bucket,
//     including the zero bucket.
//
// counterReset is true if the reason for not being appendable is a counter
// reset; in that case okToAppend is always false. A stale h is always ok to
// append unless one of the first two cases above applies.
func (a *HistogramAppender) appendable(h *histogram.Histogram) (
        positiveInserts, negativeInserts []Insert,
        okToAppend, counterReset bool,
) {
        // Checks that need no look at the bucket layout. The order of the
        // cases matters: the first matching one decides.
        switch {
        case a.NumSamples() > 0 && a.GetCounterResetHeader() == GaugeType:
                // A gauge chunk never takes counter histograms.
                return nil, nil, false, false
        case h.CounterResetHint == histogram.CounterReset:
                // Always honor the explicit counter reset hint.
                return nil, nil, false, true
        case value.IsStaleNaN(h.Sum):
                // This is a stale sample whose buckets and spans don't matter.
                return nil, nil, true, false
        case value.IsStaleNaN(a.sum):
                // If the last sample was stale, then we can only accept stale
                // samples in this chunk.
                return nil, nil, false, false
        case h.Count < a.cnt:
                // There has been a counter reset.
                return nil, nil, false, true
        case h.Schema != a.schema || h.ZeroThreshold != a.zThreshold:
                return nil, nil, false, false
        case histogram.IsCustomBucketsSchema(h.Schema) && !histogram.FloatBucketsMatch(h.CustomValues, a.customValues):
                return nil, nil, false, true
        case h.ZeroCount < a.zCnt:
                // There has been a counter reset since ZeroThreshold didn't change.
                return nil, nil, false, true
        }

        // A failed forward expansion is reported as a counter reset.
        var fits bool
        positiveInserts, fits = expandSpansForward(a.pSpans, h.PositiveSpans)
        if !fits {
                return positiveInserts, nil, false, true
        }
        negativeInserts, fits = expandSpansForward(a.nSpans, h.NegativeSpans)
        if !fits {
                return positiveInserts, negativeInserts, false, true
        }

        // The layouts are compatible; finally compare the individual buckets.
        positiveReset := counterResetInAnyBucket(a.pBuckets, h.PositiveBuckets, a.pSpans, h.PositiveSpans)
        if positiveReset || counterResetInAnyBucket(a.nBuckets, h.NegativeBuckets, a.nSpans, h.NegativeSpans) {
                return nil, nil, false, true
        }

        return positiveInserts, negativeInserts, true, false
}

// appendableGauge reports whether the gauge histogram h can be appended to
// the chunk. If it can, the remaining results describe the recoding needed to
// bring the chunk and h to a common bucket layout:
//  1. positiveInserts and negativeInserts are to be applied to the chunk (in
//     case h has buckets the chunk does not have yet).
//  2. backwardPositiveInserts and backwardNegativeInserts are to be applied
//     to h (in case the chunk has buckets that h is missing).
//  3. positiveSpans and negativeSpans are the spans returned by
//     expandSpansBothWays along with the inserts.
//
// This method must be only used for gauge histograms.
//
// okToAppend is false, and all other results are nil, in the following cases:
//   - The chunk already has samples and is not a gauge chunk.
//   - The last sample in the chunk was stale while h is not stale.
//   - The schema has changed.
//   - The threshold for the zero bucket has changed.
//   - The custom bounds have changed if the schema is a custom buckets schema.
//
// A stale h is ok to append to anything but a non-empty non-gauge chunk; no
// inserts or spans are returned for it.
func (a *HistogramAppender) appendableGauge(h *histogram.Histogram) (
        positiveInserts, negativeInserts []Insert,
        backwardPositiveInserts, backwardNegativeInserts []Insert,
        positiveSpans, negativeSpans []histogram.Span,
        okToAppend bool,
) {
        // The order of the cases matters: the first matching one decides.
        switch {
        case a.NumSamples() > 0 && a.GetCounterResetHeader() != GaugeType:
                // A counter chunk never takes gauge histograms.
                return
        case value.IsStaleNaN(h.Sum):
                // This is a stale sample whose buckets and spans don't matter.
                okToAppend = true
                return
        case value.IsStaleNaN(a.sum),
                // If the last sample was stale, then we can only accept stale
                // samples in this chunk.
                h.Schema != a.schema,
                h.ZeroThreshold != a.zThreshold,
                histogram.IsCustomBucketsSchema(h.Schema) && !histogram.FloatBucketsMatch(h.CustomValues, a.customValues):
                return
        }

        positiveInserts, backwardPositiveInserts, positiveSpans = expandSpansBothWays(a.pSpans, h.PositiveSpans)
        negativeInserts, backwardNegativeInserts, negativeSpans = expandSpansBothWays(a.nSpans, h.NegativeSpans)
        okToAppend = true
        return
}

// counterResetInAnyBucket returns true if there was a counter reset for any
// bucket. This should be called only when the bucket layout is the same or new
// buckets were added. It does not handle the case of buckets missing.
//
// oldBuckets and newBuckets are delta-encoded: the absolute value of a bucket
// is obtained by summing up all entries up to and including its own. The
// function walks both bucket lists in lockstep, matching buckets by their
// bucket index, and reports a reset as soon as a bucket present in both lists
// has a smaller absolute value in the new list than in the old one.
//
// It returns false right away if there are no old spans or no old buckets.
// It panics if the new spans are exhausted before the old ones, which cannot
// happen as long as the precondition above holds.
//
// NOTE(review): newBuckets[0] is read unconditionally once the old side is
// known to be non-empty, so an empty newBuckets would panic here. That case
// is presumably excluded by the "no buckets missing" precondition — confirm
// against the callers.
func counterResetInAnyBucket(oldBuckets, newBuckets []int64, oldSpans, newSpans []histogram.Span) bool {
        if len(oldSpans) == 0 || len(oldBuckets) == 0 {
                return false
        }

        var (
                oldSpanSliceIdx, newSpanSliceIdx     int    = -1, -1 // Index for the span slices. Starts at -1 to indicate that the first non empty span is not yet found.
                oldInsideSpanIdx, newInsideSpanIdx   uint32          // Index inside a span.
                oldIdx, newIdx                       int32           // Index inside a bucket slice.
                oldBucketSliceIdx, newBucketSliceIdx int             // Index inside bucket slice.
        )

        // Find first non empty spans.
        oldSpanSliceIdx, oldIdx = nextNonEmptySpanSliceIdx(oldSpanSliceIdx, oldIdx, oldSpans)
        newSpanSliceIdx, newIdx = nextNonEmptySpanSliceIdx(newSpanSliceIdx, newIdx, newSpans)
        // oldVal and newVal hold the absolute values of the buckets the two
        // cursors currently point at.
        oldVal, newVal := oldBuckets[0], newBuckets[0]

        // Since we assume that new spans won't have missing buckets, there will never be a case
        // where the old index will not find a matching new index.
        for {
                // Both cursors point at the same bucket: compare the values.
                if oldIdx == newIdx {
                        if newVal < oldVal {
                                return true
                        }
                }

                // The old cursor is not ahead of the new one (either the
                // buckets were just compared or the old cursor lags behind),
                // so it is the one to advance.
                if oldIdx <= newIdx {
                        // Moving ahead old bucket and span by 1 index.
                        if oldInsideSpanIdx+1 >= oldSpans[oldSpanSliceIdx].Length {
                                // Current span is over.
                                oldSpanSliceIdx, oldIdx = nextNonEmptySpanSliceIdx(oldSpanSliceIdx, oldIdx, oldSpans)
                                oldInsideSpanIdx = 0
                                if oldSpanSliceIdx >= len(oldSpans) {
                                        // All old spans are over.
                                        break
                                }
                        } else {
                                oldInsideSpanIdx++
                                oldIdx++
                        }
                        oldBucketSliceIdx++
                        // Accumulate the delta to get the absolute value of the
                        // bucket the old cursor now points at.
                        oldVal += oldBuckets[oldBucketSliceIdx]
                }

                // The old cursor is now ahead (possibly as a result of the
                // advance just above): let the new cursor catch up by one
                // bucket. Further catching up happens in later iterations.
                if oldIdx > newIdx {
                        // Moving ahead new bucket and span by 1 index.
                        if newInsideSpanIdx+1 >= newSpans[newSpanSliceIdx].Length {
                                // Current span is over.
                                newSpanSliceIdx, newIdx = nextNonEmptySpanSliceIdx(newSpanSliceIdx, newIdx, newSpans)
                                newInsideSpanIdx = 0
                                if newSpanSliceIdx >= len(newSpans) {
                                        // All new spans are over.
                                        // This should not happen, old spans above should catch this first.
                                        panic("new spans over before old spans in counterReset")
                                }
                        } else {
                                newInsideSpanIdx++
                                newIdx++
                        }
                        newBucketSliceIdx++
                        newVal += newBuckets[newBucketSliceIdx]
                }
        }

        // All old buckets were visited without finding a decrease.
        return false
}

// appendHistogram appends a histogram to the chunk. The caller must ensure that
// the histogram is properly structured, e.g. the number of buckets used
// corresponds to the number conveyed by the span structures. First call
// appendable() or appendableGauge() and act accordingly!
//
// The first sample of a chunk defines the chunk layout, which is written
// ahead of the sample, and is stored raw. Every later sample is stored as
// delta-of-deltas (timestamp, count, zero count, buckets) and xor (sum)
// against the appender's state. A stale sample (sum is the stale NaN) is
// written without any buckets.
//
// After the write, the sample count in the chunk header is incremented and
// the appender's state is updated to the values of the appended sample.
func (a *HistogramAppender) appendHistogram(t int64, h *histogram.Histogram) {
        var tDelta, cntDelta, zCntDelta int64
        // Current number of samples, as recorded in the chunk header.
        num := binary.BigEndian.Uint16(a.b.bytes())

        if value.IsStaleNaN(h.Sum) {
                // Emptying out other fields to write no buckets, and an empty
                // layout in case of first histogram in the chunk.
                h = &histogram.Histogram{Sum: h.Sum}
        }

        if num == 0 {
                // The first append gets the privilege to dictate the layout
                // but it's also responsible for encoding it into the chunk!
                writeHistogramChunkLayout(a.b, h.Schema, h.ZeroThreshold, h.PositiveSpans, h.NegativeSpans, h.CustomValues)
                a.schema = h.Schema
                a.zThreshold = h.ZeroThreshold

                // The layout slices are copied so that the appender does not
                // share backing arrays with the caller's histogram.
                if len(h.PositiveSpans) > 0 {
                        a.pSpans = make([]histogram.Span, len(h.PositiveSpans))
                        copy(a.pSpans, h.PositiveSpans)
                } else {
                        a.pSpans = nil
                }
                if len(h.NegativeSpans) > 0 {
                        a.nSpans = make([]histogram.Span, len(h.NegativeSpans))
                        copy(a.nSpans, h.NegativeSpans)
                } else {
                        a.nSpans = nil
                }
                if len(h.CustomValues) > 0 {
                        a.customValues = make([]float64, len(h.CustomValues))
                        copy(a.customValues, h.CustomValues)
                } else {
                        a.customValues = nil
                }

                // Allocate the per-bucket state (last values and last deltas),
                // sized by the number of buckets the spans describe. The deltas
                // start out as zero; the values are filled in at the end of
                // this function.
                numPBuckets, numNBuckets := countSpans(h.PositiveSpans), countSpans(h.NegativeSpans)
                if numPBuckets > 0 {
                        a.pBuckets = make([]int64, numPBuckets)
                        a.pBucketsDelta = make([]int64, numPBuckets)
                } else {
                        a.pBuckets = nil
                        a.pBucketsDelta = nil
                }
                if numNBuckets > 0 {
                        a.nBuckets = make([]int64, numNBuckets)
                        a.nBucketsDelta = make([]int64, numNBuckets)
                } else {
                        a.nBuckets = nil
                        a.nBucketsDelta = nil
                }

                // Now store the actual data.
                putVarbitInt(a.b, t)
                putVarbitUint(a.b, h.Count)
                putVarbitUint(a.b, h.ZeroCount)
                a.b.writeBits(math.Float64bits(h.Sum), 64)
                for _, b := range h.PositiveBuckets {
                        putVarbitInt(a.b, b)
                }
                for _, b := range h.NegativeBuckets {
                        putVarbitInt(a.b, b)
                }
        } else {
                // The case for the 2nd sample with single deltas is implicitly
                // handled correctly with the double delta code, so we don't
                // need a separate single delta logic for the 2nd sample.

                tDelta = t - a.t
                cntDelta = int64(h.Count) - int64(a.cnt)
                zCntDelta = int64(h.ZeroCount) - int64(a.zCnt)

                tDod := tDelta - a.tDelta
                cntDod := cntDelta - a.cntDelta
                zCntDod := zCntDelta - a.zCntDelta

                // For a stale sample, the count and zero count dods are
                // written as 0, regardless of the deltas computed above.
                if value.IsStaleNaN(h.Sum) {
                        cntDod, zCntDod = 0, 0
                }

                putVarbitInt(a.b, tDod)
                putVarbitInt(a.b, cntDod)
                putVarbitInt(a.b, zCntDod)

                a.writeSumDelta(h.Sum)

                // The bucket deltas are updated right here, while the bucket
                // values are only updated at the end of the function because
                // they are still needed to compute the deltas.
                for i, b := range h.PositiveBuckets {
                        delta := b - a.pBuckets[i]
                        dod := delta - a.pBucketsDelta[i]
                        putVarbitInt(a.b, dod)
                        a.pBucketsDelta[i] = delta
                }
                for i, b := range h.NegativeBuckets {
                        delta := b - a.nBuckets[i]
                        dod := delta - a.nBucketsDelta[i]
                        putVarbitInt(a.b, dod)
                        a.nBucketsDelta[i] = delta
                }
        }

        // Record the new sample count in the chunk header.
        binary.BigEndian.PutUint16(a.b.bytes(), num+1)

        // Remember the appended sample as the reference for the next append.
        // For the first sample, the deltas are still zero at this point.
        a.t = t
        a.cnt = h.Count
        a.zCnt = h.ZeroCount
        a.tDelta = tDelta
        a.cntDelta = cntDelta
        a.zCntDelta = zCntDelta

        copy(a.pBuckets, h.PositiveBuckets)
        copy(a.nBuckets, h.NegativeBuckets)
        // Note that the bucket deltas were already updated above.
        a.sum = h.Sum
}

// recode converts the current chunk to accommodate an expansion of the set of
// (positive and/or negative) buckets used, according to the provided inserts,
// resulting in the honoring of the provided new positive and negative spans. To
// continue appending, use the returned Appender rather than the receiver of
// this method.
//
// The receiver's chunk is not modified. All of its samples are decoded,
// adjusted to the new spans, and appended to a newly created chunk, which
// finally gets the counter reset header of the original chunk. The returned
// Chunk is that new chunk and the returned Appender is its appender.
func (a *HistogramAppender) recode(
        positiveInserts, negativeInserts []Insert,
        positiveSpans, negativeSpans []histogram.Span,
) (Chunk, Appender) {
        // TODO(beorn7): This currently just decodes everything and then encodes
        // it again with the new span layout. This can probably be done in-place
        // by editing the chunk. But let's first see how expensive it is in the
        // big picture. Also, in-place editing might create concurrency issues.
        byts := a.b.bytes()
        it := newHistogramIterator(byts)
        hc := NewHistogramChunk()
        app, err := hc.Appender()
        if err != nil {
                panic(err) // This should never happen for an empty histogram chunk.
        }
        happ := app.(*HistogramAppender)
        // Number of buckets each sample has in the new layout.
        numPositiveBuckets, numNegativeBuckets := countSpans(positiveSpans), countSpans(negativeSpans)

        for it.Next() == ValHistogram {
                tOld, hOld := it.AtHistogram(nil)

                // We have to newly allocate slices for the modified buckets
                // here because they are kept by the appender until the next
                // append.
                // TODO(beorn7): We might be able to optimize this.
                var positiveBuckets, negativeBuckets []int64
                if numPositiveBuckets > 0 {
                        positiveBuckets = make([]int64, numPositiveBuckets)
                }
                if numNegativeBuckets > 0 {
                        negativeBuckets = make([]int64, numNegativeBuckets)
                }

                // Save the modified histogram to the new chunk.
                hOld.PositiveSpans, hOld.NegativeSpans = positiveSpans, negativeSpans
                // The buckets of a side are only rewritten if that side has
                // inserts; otherwise the decoded buckets are used as they are.
                if len(positiveInserts) > 0 {
                        hOld.PositiveBuckets = insert(hOld.PositiveBuckets, positiveBuckets, positiveInserts, true)
                }
                if len(negativeInserts) > 0 {
                        hOld.NegativeBuckets = insert(hOld.NegativeBuckets, negativeBuckets, negativeInserts, true)
                }
                happ.appendHistogram(tOld, hOld)
        }

        // Carry the counter reset header of the original chunk over to the
        // new one.
        happ.setCounterResetHeader(CounterResetHeader(byts[2] & CounterResetHeaderMask))
        return hc, app
}

// recodeHistogram expands the buckets of h in place according to the given
// backward inserts. h.PositiveSpans and h.NegativeSpans must already describe
// the expanded layout, because they determine the size of the new bucket
// slices. A side without inserts is left untouched.
func (a *HistogramAppender) recodeHistogram(
        h *histogram.Histogram,
        pBackwardInserts, nBackwardInserts []Insert,
) {
        if len(pBackwardInserts) > 0 {
                expanded := make([]int64, countSpans(h.PositiveSpans))
                h.PositiveBuckets = insert(h.PositiveBuckets, expanded, pBackwardInserts, true)
        }
        if len(nBackwardInserts) > 0 {
                expanded := make([]int64, countSpans(h.NegativeSpans))
                h.NegativeBuckets = insert(h.NegativeBuckets, expanded, nBackwardInserts, true)
        }
}

// writeSumDelta writes the sum v to the bit stream via xorWrite, encoded
// against the previously appended sum a.sum. The leading and trailing state of
// the appender is passed by pointer so that xorWrite can update it. a.sum
// itself is not updated here.
func (a *HistogramAppender) writeSumDelta(v float64) {
        xorWrite(a.b, v, a.sum, &a.leading, &a.trailing)
}

// AppendFloatHistogram always panics because float histogram samples must
// never be appended to a (integer) histogram chunk. All arguments are ignored.
func (a *HistogramAppender) AppendFloatHistogram(*FloatHistogramAppender, int64, *histogram.FloatHistogram, bool) (Chunk, bool, Appender, error) {
        panic("appended a float histogram sample to a histogram chunk")
}

// AppendHistogram appends the histogram h with timestamp t, taking care of
// counter reset detection and of changes to the bucket layout.
//
// If the chunk is empty, the sample is appended directly and the counter
// reset header of the chunk is set: to GaugeType or CounterReset if
// h.CounterResetHint says so, otherwise, if prev is not nil, to CounterReset
// or NotCounterReset depending on whether prev.appendable(h) detects a
// counter reset. If neither applies, the header is left as it is.
//
// If the chunk is not empty, the sample is appended in place if possible. If
// it is not, the behavior depends on appendOnly: if it is true, an error is
// returned and nothing is appended. If it is false, the sample is either
// appended to a newly created chunk (counter reset, schema change or the
// like) or the current chunk is recoded to a layout that accommodates the new
// buckets and the sample is appended to the recoded chunk.
//
// For gauge histograms, h may be modified in place: if the chunk has buckets
// that h is missing, the spans and buckets of h are expanded to the merged
// layout.
//
// The return values are:
//   - Chunk: the newly created or recoded chunk, or nil if the sample was
//     appended to the current chunk or an error is returned.
//   - bool: true if the returned chunk is a recoded version of the current
//     chunk, false in all other cases.
//   - Appender: the appender to use for subsequent appends. It is the
//     receiver unless a new or recoded chunk is returned.
//   - error: only non-nil if appendOnly is true and the sample cannot be
//     appended in place.
func (a *HistogramAppender) AppendHistogram(prev *HistogramAppender, t int64, h *histogram.Histogram, appendOnly bool) (Chunk, bool, Appender, error) {
        if a.NumSamples() == 0 {
                // The first sample of a chunk can always be appended; only the
                // counter reset header needs to be decided.
                a.appendHistogram(t, h)
                if h.CounterResetHint == histogram.GaugeType {
                        a.setCounterResetHeader(GaugeType)
                        return nil, false, a, nil
                }

                switch {
                case h.CounterResetHint == histogram.CounterReset:
                        // Always honor the explicit counter reset hint.
                        a.setCounterResetHeader(CounterReset)
                case prev != nil:
                        // This is a new chunk, but continued from a previous one. We need to calculate the reset header unless already set.
                        _, _, _, counterReset := prev.appendable(h)
                        if counterReset {
                                a.setCounterResetHeader(CounterReset)
                        } else {
                                a.setCounterResetHeader(NotCounterReset)
                        }
                }
                return nil, false, a, nil
        }

        // Adding counter-like histogram.
        if h.CounterResetHint != histogram.GaugeType {
                pForwardInserts, nForwardInserts, okToAppend, counterReset := a.appendable(h)
                if !okToAppend || counterReset {
                        if appendOnly {
                                if counterReset {
                                        return nil, false, a, fmt.Errorf("histogram counter reset")
                                }
                                return nil, false, a, fmt.Errorf("histogram schema change")
                        }
                        // Start a new chunk with this sample. The header is only
                        // set explicitly for a detected counter reset.
                        newChunk := NewHistogramChunk()
                        app, err := newChunk.Appender()
                        if err != nil {
                                panic(err) // This should never happen for an empty histogram chunk.
                        }
                        happ := app.(*HistogramAppender)
                        if counterReset {
                                happ.setCounterResetHeader(CounterReset)
                        }
                        happ.appendHistogram(t, h)
                        return newChunk, false, app, nil
                }
                if len(pForwardInserts) > 0 || len(nForwardInserts) > 0 {
                        if appendOnly {
                                return nil, false, a, fmt.Errorf("histogram layout change with %d positive and %d negative forwards inserts", len(pForwardInserts), len(nForwardInserts))
                        }
                        // h has buckets the chunk does not have yet: recode the
                        // chunk to the layout of h, then append to the result.
                        chk, app := a.recode(
                                pForwardInserts, nForwardInserts,
                                h.PositiveSpans, h.NegativeSpans,
                        )
                        app.(*HistogramAppender).appendHistogram(t, h)
                        return chk, true, app, nil
                }
                // Same layout, no counter reset: append in place.
                a.appendHistogram(t, h)
                return nil, false, a, nil
        }
        // Adding gauge histogram.
        pForwardInserts, nForwardInserts, pBackwardInserts, nBackwardInserts, pMergedSpans, nMergedSpans, okToAppend := a.appendableGauge(h)
        if !okToAppend {
                if appendOnly {
                        return nil, false, a, fmt.Errorf("gauge histogram schema change")
                }
                // Start a new gauge chunk with this sample.
                newChunk := NewHistogramChunk()
                app, err := newChunk.Appender()
                if err != nil {
                        panic(err) // This should never happen for an empty histogram chunk.
                }
                happ := app.(*HistogramAppender)
                happ.setCounterResetHeader(GaugeType)
                happ.appendHistogram(t, h)
                return newChunk, false, app, nil
        }

        if len(pBackwardInserts)+len(nBackwardInserts) > 0 {
                if appendOnly {
                        return nil, false, a, fmt.Errorf("gauge histogram layout change with %d positive and %d negative backwards inserts", len(pBackwardInserts), len(nBackwardInserts))
                }
                // The chunk has buckets that h is missing: expand h (in place)
                // to the merged layout. This must happen before a possible
                // recoding of the chunk below, which uses the spans of h.
                h.PositiveSpans = pMergedSpans
                h.NegativeSpans = nMergedSpans
                a.recodeHistogram(h, pBackwardInserts, nBackwardInserts)
        }

        if len(pForwardInserts) > 0 || len(nForwardInserts) > 0 {
                if appendOnly {
                        return nil, false, a, fmt.Errorf("gauge histogram layout change with %d positive and %d negative forwards inserts", len(pForwardInserts), len(nForwardInserts))
                }
                // h has buckets the chunk does not have yet: recode the chunk
                // to the (possibly merged) layout of h, then append to the
                // result.
                chk, app := a.recode(
                        pForwardInserts, nForwardInserts,
                        h.PositiveSpans, h.NegativeSpans,
                )
                app.(*HistogramAppender).appendHistogram(t, h)
                return chk, true, app, nil
        }

        a.appendHistogram(t, h)
        return nil, false, a, nil
}

// CounterResetHintToHeader translates a histogram.CounterResetHint into the
// corresponding CounterResetHeader. Every hint other than CounterReset,
// NotCounterReset and GaugeType results in UnknownCounterReset.
func CounterResetHintToHeader(hint histogram.CounterResetHint) CounterResetHeader {
        header := UnknownCounterReset
        switch hint {
        case histogram.CounterReset:
                header = CounterReset
        case histogram.NotCounterReset:
                header = NotCounterReset
        case histogram.GaugeType:
                header = GaugeType
        }
        return header
}

// histogramIterator iterates over the samples of an encoded histogram chunk.
// Besides the bit reader, it holds the chunk layout and the values and deltas
// of the most recently decoded sample.
type histogramIterator struct {
        br bstreamReader
        // numTotal is the number of samples according to the chunk header.
        numTotal uint16
        // numRead is presumably the number of samples decoded so far; it is 0
        // as long as no sample has been read.
        numRead uint16

        // counterResetHeader holds the counter reset bits of the chunk header.
        counterResetHeader CounterResetHeader

        // Layout:
        schema         int32
        zThreshold     float64
        pSpans, nSpans []histogram.Span
        customValues   []float64

        // For the fields that are tracked as deltas and ultimately dod's.
        t                            int64
        cnt, zCnt                    uint64
        tDelta, cntDelta, zCntDelta  int64
        pBuckets, nBuckets           []int64   // Delta between buckets.
        pFloatBuckets, nFloatBuckets []float64 // Absolute counts.
        pBucketsDelta, nBucketsDelta []int64

        // The sum is Gorilla xor encoded.
        sum      float64
        leading  uint8
        trailing uint8

        // Track calls to retrieve methods. Once they have been called, we
        // cannot recycle the bucket slices anymore because we have returned
        // them in the histogram.
        atHistogramCalled, atFloatHistogramCalled bool

        // err is the error that stopped the iteration, if any.
        err error
}

// Seek advances the iterator until the current sample has a timestamp of at
// least t. At least one sample is always read if none has been read yet. It
// returns ValNone if the iterator is in an error state or runs out of
// samples first, and ValHistogram otherwise.
func (it *histogramIterator) Seek(t int64) ValueType {
        if it.err != nil {
                return ValNone
        }

        // Keep stepping while nothing has been decoded yet or the current
        // timestamp is still before the requested one.
        for it.numRead == 0 || it.t < t {
                if it.Next() == ValNone {
                        return ValNone
                }
        }
        return ValHistogram
}

// At is not supported by histogramIterator and unconditionally panics. Use
// AtHistogram or AtFloatHistogram to retrieve the current sample.
func (it *histogramIterator) At() (int64, float64) {
        panic("cannot call histogramIterator.At")
}

// AtHistogram returns the timestamp and the integer histogram of the current
// sample.
//
// If the current sum is a stale NaN, a new histogram carrying only that sum
// is returned. If h is nil, a new histogram is returned that shares the
// iterator's span, bucket and custom value slices (which is recorded so that
// those bucket slices are not recycled afterwards). Otherwise the sample is
// copied into h, reusing h's slices where their capacity suffices, and h is
// returned.
func (it *histogramIterator) AtHistogram(h *histogram.Histogram) (int64, *histogram.Histogram) {
        if value.IsStaleNaN(it.sum) {
                return it.t, &histogram.Histogram{Sum: it.sum}
        }

        hint := counterResetHint(it.counterResetHeader, it.numRead)

        if h == nil {
                // The iterator's slices escape to the caller here.
                it.atHistogramCalled = true
                shared := &histogram.Histogram{
                        CounterResetHint: hint,
                        Count:            it.cnt,
                        ZeroCount:        it.zCnt,
                        Sum:              it.sum,
                        ZeroThreshold:    it.zThreshold,
                        Schema:           it.schema,
                        PositiveSpans:    it.pSpans,
                        NegativeSpans:    it.nSpans,
                        PositiveBuckets:  it.pBuckets,
                        NegativeBuckets:  it.nBuckets,
                        CustomValues:     it.customValues,
                }
                return it.t, shared
        }

        // Scalar fields.
        h.CounterResetHint = hint
        h.Schema, h.ZeroThreshold = it.schema, it.zThreshold
        h.ZeroCount, h.Count, h.Sum = it.zCnt, it.cnt, it.sum

        // Slice fields are deep-copied into the caller's histogram.
        h.PositiveSpans = resize(h.PositiveSpans, len(it.pSpans))
        h.NegativeSpans = resize(h.NegativeSpans, len(it.nSpans))
        h.PositiveBuckets = resize(h.PositiveBuckets, len(it.pBuckets))
        h.NegativeBuckets = resize(h.NegativeBuckets, len(it.nBuckets))
        h.CustomValues = resize(h.CustomValues, len(it.customValues))

        copy(h.PositiveSpans, it.pSpans)
        copy(h.NegativeSpans, it.nSpans)
        copy(h.PositiveBuckets, it.pBuckets)
        copy(h.NegativeBuckets, it.nBuckets)
        copy(h.CustomValues, it.customValues)

        return it.t, h
}

// AtFloatHistogram returns the timestamp and the current sample converted to
// a float histogram with absolute bucket counts.
//
// If the current sum is a stale NaN, a new float histogram carrying only
// that sum is returned. If fh is nil, a new float histogram is returned that
// shares the iterator's span, float bucket and custom value slices (which is
// recorded so that the float bucket slices are not recycled afterwards).
// Otherwise the sample is written into fh, reusing fh's slices where their
// capacity suffices, and fh is returned.
func (it *histogramIterator) AtFloatHistogram(fh *histogram.FloatHistogram) (int64, *histogram.FloatHistogram) {
        if value.IsStaleNaN(it.sum) {
                return it.t, &histogram.FloatHistogram{Sum: it.sum}
        }

        hint := counterResetHint(it.counterResetHeader, it.numRead)

        if fh == nil {
                // The iterator's slices escape to the caller here.
                it.atFloatHistogramCalled = true
                shared := &histogram.FloatHistogram{
                        CounterResetHint: hint,
                        Count:            float64(it.cnt),
                        ZeroCount:        float64(it.zCnt),
                        Sum:              it.sum,
                        ZeroThreshold:    it.zThreshold,
                        Schema:           it.schema,
                        PositiveSpans:    it.pSpans,
                        NegativeSpans:    it.nSpans,
                        PositiveBuckets:  it.pFloatBuckets,
                        NegativeBuckets:  it.nFloatBuckets,
                        CustomValues:     it.customValues,
                }
                return it.t, shared
        }

        // Scalar fields.
        fh.CounterResetHint = hint
        fh.Schema, fh.ZeroThreshold = it.schema, it.zThreshold
        fh.ZeroCount, fh.Count = float64(it.zCnt), float64(it.cnt)
        fh.Sum = it.sum

        // Spans and custom values are plain copies.
        fh.PositiveSpans = resize(fh.PositiveSpans, len(it.pSpans))
        copy(fh.PositiveSpans, it.pSpans)
        fh.NegativeSpans = resize(fh.NegativeSpans, len(it.nSpans))
        copy(fh.NegativeSpans, it.nSpans)
        fh.CustomValues = resize(fh.CustomValues, len(it.customValues))
        copy(fh.CustomValues, it.customValues)

        // The integer buckets are deltas between neighbours; accumulate them
        // to obtain the absolute float counts.
        fh.PositiveBuckets = resize(fh.PositiveBuckets, len(it.pBuckets))
        running := float64(0)
        for i := range it.pBuckets {
                running += float64(it.pBuckets[i])
                fh.PositiveBuckets[i] = running
        }

        fh.NegativeBuckets = resize(fh.NegativeBuckets, len(it.nBuckets))
        running = 0
        for i := range it.nBuckets {
                running += float64(it.nBuckets[i])
                fh.NegativeBuckets[i] = running
        }

        return it.t, fh
}

// AtT returns the timestamp of the current sample.
func (it *histogramIterator) AtT() int64 {
        return it.t
}

// Err returns the error recorded by the iterator, or nil if there is none.
func (it *histogramIterator) Err() error {
        return it.err
}

// Reset re-targets the iterator at the chunk bytes b and clears all decoding
// state. The first two bytes of b are read as the big-endian sample count and
// the third byte, masked with CounterResetHeaderMask, as the counter reset
// header; sample data starts after these three bytes.
//
// Bucket slices that have not been handed out through AtHistogram or
// AtFloatHistogram are truncated for reuse; slices that have been handed out
// are dropped.
func (it *histogramIterator) Reset(b []byte) {
        // Skip the 3 header bytes for the actual sample stream.
        it.br = newBReader(b[3:])
        it.numTotal, it.numRead = binary.BigEndian.Uint16(b), 0
        it.counterResetHeader = CounterResetHeader(b[2] & CounterResetHeaderMask)

        it.t, it.tDelta = 0, 0
        it.cnt, it.cntDelta = 0, 0
        it.zCnt, it.zCntDelta = 0, 0

        // Integer buckets: keep the backing arrays only if they are still
        // exclusively owned by the iterator.
        if !it.atHistogramCalled {
                it.pBuckets, it.nBuckets = it.pBuckets[:0], it.nBuckets[:0]
        } else {
                it.pBuckets, it.nBuckets = nil, nil
                it.atHistogramCalled = false
        }

        // Float buckets: same rule.
        if !it.atFloatHistogramCalled {
                it.pFloatBuckets, it.nFloatBuckets = it.pFloatBuckets[:0], it.nFloatBuckets[:0]
        } else {
                it.pFloatBuckets, it.nFloatBuckets = nil, nil
                it.atFloatHistogramCalled = false
        }

        // The delta slices are never returned to callers, so always reuse them.
        it.pBucketsDelta, it.nBucketsDelta = it.pBucketsDelta[:0], it.nBucketsDelta[:0]

        it.sum = 0
        it.leading, it.trailing = 0, 0
        it.err = nil
}

// Next decodes the next sample of the chunk into the iterator. It returns
// ValHistogram on success. It returns ValNone if the iterator already holds
// an error, if all numTotal samples have been read, or if decoding fails (in
// which case the error is stored in it.err).
//
// The first sample is stored with the chunk layout followed by raw values;
// every later sample is stored as delta-of-deltas (timestamp, counts,
// buckets) and an xor-encoded sum.
func (it *histogramIterator) Next() ValueType {
        if it.err != nil || it.numRead == it.numTotal {
                return ValNone
        }

        if it.numRead == 0 {
                // The first read is responsible for reading the chunk layout
                // and for initializing fields that depend on it. We give
                // counter reset info at chunk level, hence we discard it here.
                schema, zeroThreshold, posSpans, negSpans, customValues, err := readHistogramChunkLayout(&it.br)
                if err != nil {
                        it.err = err
                        return ValNone
                }
                it.schema = schema
                it.zThreshold = zeroThreshold
                it.pSpans, it.nSpans = posSpans, negSpans
                it.customValues = customValues
                numPBuckets, numNBuckets := countSpans(posSpans), countSpans(negSpans)
                // The code below recycles existing slices in case this iterator
                // was reset and already has slices of a sufficient capacity.
                if numPBuckets > 0 {
                        it.pBuckets = append(it.pBuckets, make([]int64, numPBuckets)...)
                        it.pBucketsDelta = append(it.pBucketsDelta, make([]int64, numPBuckets)...)
                        it.pFloatBuckets = append(it.pFloatBuckets, make([]float64, numPBuckets)...)
                }
                if numNBuckets > 0 {
                        it.nBuckets = append(it.nBuckets, make([]int64, numNBuckets)...)
                        it.nBucketsDelta = append(it.nBucketsDelta, make([]int64, numNBuckets)...)
                        it.nFloatBuckets = append(it.nFloatBuckets, make([]float64, numNBuckets)...)
                }

                // Now read the actual data.
                t, err := readVarbitInt(&it.br)
                if err != nil {
                        it.err = err
                        return ValNone
                }
                it.t = t

                cnt, err := readVarbitUint(&it.br)
                if err != nil {
                        it.err = err
                        return ValNone
                }
                it.cnt = cnt

                zcnt, err := readVarbitUint(&it.br)
                if err != nil {
                        it.err = err
                        return ValNone
                }
                it.zCnt = zcnt

                // The first sum is stored as the raw 64 bits of the float.
                sum, err := it.br.readBits(64)
                if err != nil {
                        it.err = err
                        return ValNone
                }
                it.sum = math.Float64frombits(sum)

                // Each stored bucket value is the delta to the previous bucket;
                // the running total gives the absolute count for the float view.
                var current int64
                for i := range it.pBuckets {
                        v, err := readVarbitInt(&it.br)
                        if err != nil {
                                it.err = err
                                return ValNone
                        }
                        it.pBuckets[i] = v
                        current += it.pBuckets[i]
                        it.pFloatBuckets[i] = float64(current)
                }
                current = 0
                for i := range it.nBuckets {
                        v, err := readVarbitInt(&it.br)
                        if err != nil {
                                it.err = err
                                return ValNone
                        }
                        it.nBuckets[i] = v
                        current += it.nBuckets[i]
                        it.nFloatBuckets[i] = float64(current)
                }

                it.numRead++
                return ValHistogram
        }

        // The case for the 2nd sample with single deltas is implicitly handled correctly with the double delta code,
        // so we don't need a separate single delta logic for the 2nd sample.

        // Recycle bucket slices that have not been returned yet. Otherwise,
        // copy them.
        if it.atHistogramCalled {
                it.atHistogramCalled = false
                if len(it.pBuckets) > 0 {
                        newBuckets := make([]int64, len(it.pBuckets))
                        copy(newBuckets, it.pBuckets)
                        it.pBuckets = newBuckets
                } else {
                        it.pBuckets = nil
                }
                if len(it.nBuckets) > 0 {
                        newBuckets := make([]int64, len(it.nBuckets))
                        copy(newBuckets, it.nBuckets)
                        it.nBuckets = newBuckets
                } else {
                        it.nBuckets = nil
                }
        }
        // FloatBuckets are set from scratch, so simply create empty ones.
        if it.atFloatHistogramCalled {
                it.atFloatHistogramCalled = false
                if len(it.pFloatBuckets) > 0 {
                        it.pFloatBuckets = make([]float64, len(it.pFloatBuckets))
                } else {
                        it.pFloatBuckets = nil
                }
                if len(it.nFloatBuckets) > 0 {
                        it.nFloatBuckets = make([]float64, len(it.nFloatBuckets))
                } else {
                        it.nFloatBuckets = nil
                }
        }

        // Timestamp: apply the delta-of-delta to the delta, then the delta to
        // the value.
        tDod, err := readVarbitInt(&it.br)
        if err != nil {
                it.err = err
                return ValNone
        }
        it.tDelta += tDod
        it.t += it.tDelta

        // Count: same scheme; the arithmetic is done in int64 because the
        // delta may be negative.
        cntDod, err := readVarbitInt(&it.br)
        if err != nil {
                it.err = err
                return ValNone
        }
        it.cntDelta += cntDod
        it.cnt = uint64(int64(it.cnt) + it.cntDelta)

        // Zero count: same scheme as the count.
        zcntDod, err := readVarbitInt(&it.br)
        if err != nil {
                it.err = err
                return ValNone
        }
        it.zCntDelta += zcntDod
        it.zCnt = uint64(int64(it.zCnt) + it.zCntDelta)

        // readSum stores any error in it.err itself.
        ok := it.readSum()
        if !ok {
                return ValNone
        }

        // For a stale sample no bucket data is read; the sample is complete
        // at this point.
        if value.IsStaleNaN(it.sum) {
                it.numRead++
                return ValHistogram
        }

        // Buckets: each bucket has its own delta-of-delta. The running total
        // over the updated bucket deltas yields the absolute float counts.
        var current int64
        for i := range it.pBuckets {
                dod, err := readVarbitInt(&it.br)
                if err != nil {
                        it.err = err
                        return ValNone
                }
                it.pBucketsDelta[i] += dod
                it.pBuckets[i] += it.pBucketsDelta[i]
                current += it.pBuckets[i]
                it.pFloatBuckets[i] = float64(current)
        }

        current = 0
        for i := range it.nBuckets {
                dod, err := readVarbitInt(&it.br)
                if err != nil {
                        it.err = err
                        return ValNone
                }
                it.nBucketsDelta[i] += dod
                it.nBuckets[i] += it.nBucketsDelta[i]
                current += it.nBuckets[i]
                it.nFloatBuckets[i] = float64(current)
        }

        it.numRead++
        return ValHistogram
}

// readSum decodes the next xor-encoded sum into it.sum, updating the leading
// and trailing state. It reports whether decoding succeeded; on failure the
// error is stored in it.err.
func (it *histogramIterator) readSum() bool {
        if err := xorRead(&it.br, &it.sum, &it.leading, &it.trailing); err != nil {
                it.err = err
                return false
        }
        return true
}

// resize returns a slice of length n. If items has enough capacity, it is
// re-sliced to length n (keeping its backing array and contents); otherwise
// a newly allocated, zeroed slice of length n is returned.
func resize[T any](items []T, n int) []T {
        if n <= cap(items) {
                return items[:n]
        }
        return make([]T, n)
}

// Copyright 2021 The Prometheus Authors
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package chunkenc

import (
        "math"

        "github.com/prometheus/prometheus/model/histogram"
)

// writeHistogramChunkLayout writes the histogram layout to b in this order:
// zero threshold, schema, positive spans, negative spans and, only for
// custom-bucket schemas, the custom values.
func writeHistogramChunkLayout(
        b *bstream, schema int32, zeroThreshold float64,
        positiveSpans, negativeSpans []histogram.Span, customValues []float64,
) {
        putZeroThreshold(b, zeroThreshold)
        putVarbitInt(b, int64(schema))
        putHistogramChunkLayoutSpans(b, positiveSpans)
        putHistogramChunkLayoutSpans(b, negativeSpans)

        // Custom values are only part of the layout for custom-bucket schemas.
        if !histogram.IsCustomBucketsSchema(schema) {
                return
        }
        putHistogramChunkLayoutCustomBounds(b, customValues)
}

// readHistogramChunkLayout reads the layout written by
// writeHistogramChunkLayout: zero threshold, schema, positive spans, negative
// spans and, only for custom-bucket schemas, the custom values. Reading stops
// at the first error, which is returned together with whatever has been
// decoded up to that point.
func readHistogramChunkLayout(b *bstreamReader) (
        schema int32, zeroThreshold float64,
        positiveSpans, negativeSpans []histogram.Span,
        customValues []float64,
        err error,
) {
        if zeroThreshold, err = readZeroThreshold(b); err != nil {
                return
        }

        var rawSchema int64
        if rawSchema, err = readVarbitInt(b); err != nil {
                return
        }
        schema = int32(rawSchema)

        if positiveSpans, err = readHistogramChunkLayoutSpans(b); err != nil {
                return
        }
        if negativeSpans, err = readHistogramChunkLayoutSpans(b); err != nil {
                return
        }

        // Custom values are only present for custom-bucket schemas.
        if histogram.IsCustomBucketsSchema(schema) {
                customValues, err = readHistogramChunkLayoutCustomBounds(b)
        }
        return
}

// putHistogramChunkLayoutSpans writes the number of spans followed by, for
// each span, its length (unsigned varbit) and its offset (signed varbit).
func putHistogramChunkLayoutSpans(b *bstream, spans []histogram.Span) {
        putVarbitUint(b, uint64(len(spans)))
        for i := range spans {
                putVarbitUint(b, uint64(spans[i].Length))
                putVarbitInt(b, int64(spans[i].Offset))
        }
}

// readHistogramChunkLayoutSpans reads the spans written by
// putHistogramChunkLayoutSpans: a count, then length and offset per span. It
// returns a nil slice if the count is zero, and nil plus the error if any
// read fails.
func readHistogramChunkLayoutSpans(b *bstreamReader) ([]histogram.Span, error) {
        count, err := readVarbitUint(b)
        if err != nil {
                return nil, err
        }

        var spans []histogram.Span
        for remaining := int(count); remaining > 0; remaining-- {
                length, err := readVarbitUint(b)
                if err != nil {
                        return nil, err
                }
                offset, err := readVarbitInt(b)
                if err != nil {
                        return nil, err
                }
                spans = append(spans, histogram.Span{
                        Offset: int32(offset),
                        Length: uint32(length),
                })
        }
        return spans, nil
}

// putHistogramChunkLayoutCustomBounds writes the number of custom values
// followed by each value encoded with putCustomBound.
func putHistogramChunkLayoutCustomBounds(b *bstream, customValues []float64) {
        putVarbitUint(b, uint64(len(customValues)))
        for i := range customValues {
                putCustomBound(b, customValues[i])
        }
}

// readHistogramChunkLayoutCustomBounds reads the custom values written by
// putHistogramChunkLayoutCustomBounds: a count, then that many bounds decoded
// with readCustomBound. It returns a nil slice if the count is zero, and nil
// plus the error if any read fails.
func readHistogramChunkLayoutCustomBounds(b *bstreamReader) ([]float64, error) {
        count, err := readVarbitUint(b)
        if err != nil {
                return nil, err
        }

        var bounds []float64
        for remaining := int(count); remaining > 0; remaining-- {
                bound, err := readCustomBound(b)
                if err != nil {
                        return nil, err
                }
                bounds = append(bounds, bound)
        }
        return bounds, nil
}

// putZeroThreshold writes the zero threshold to the bstream. It stores typical
// values in just one byte, but needs 9 bytes for other values. In detail:
//   - If the threshold is 0, store a single zero byte.
//   - If the threshold is a power of 2 between (and including) 2^-243 and 2^10,
//     take the exponent from the IEEE 754 representation of the threshold, which
//     covers a range between (and including) -242 and 11. (2^-243 is 0.5*2^-242
//     in IEEE 754 representation, and 2^10 is 0.5*2^11.) Add 243 to the exponent
//     and store the result (which will be between 1 and 254) as a single
//     byte. Note that small powers of two are preferred values for the zero
//     threshold. The default value for the zero threshold is 2^-128 (or
//     0.5*2^-127 in IEEE 754 representation) and will therefore be encoded as a
//     single byte (with value 116).
//   - In all other cases, store 255 as a single byte, followed by the 8 bytes of
//     the threshold as a float64, i.e. taking 9 bytes in total.
func putZeroThreshold(b *bstream, threshold float64) {
        if threshold == 0 {
                b.writeByte(0)
                return
        }

        // Compact form: an exact power of two whose exponent fits in the
        // single-byte range 1..254 after adding 243.
        frac, exp := math.Frexp(threshold)
        if frac == 0.5 && exp >= -242 && exp <= 11 {
                b.writeByte(byte(exp + 243))
                return
        }

        // Everything else: marker byte 255 followed by the raw float64 bits.
        b.writeByte(255)
        b.writeBits(math.Float64bits(threshold), 64)
}

// readZeroThreshold decodes a zero threshold encoded by putZeroThreshold. The
// first byte selects the form: 0 means a threshold of 0, 255 means the next
// 64 bits hold the raw float64, and any other value v stands for
// 0.5 * 2^(v-243).
func readZeroThreshold(br *bstreamReader) (float64, error) {
        code, err := br.ReadByte()
        if err != nil {
                return 0, err
        }
        if code == 0 {
                return 0, nil
        }
        if code != 255 {
                return math.Ldexp(0.5, int(code)-243), nil
        }

        bits, err := br.readBits(64)
        if err != nil {
                return 0, err
        }
        return math.Float64frombits(bits), nil
}

// isWholeWhenMultiplied reports whether in survives a round trip through
// "multiply by 1000, round to an unsigned integer, divide by 1000"
// unchanged, i.e. whether in*1000 can be stored as an integer without losing
// precision.
func isWholeWhenMultiplied(in float64) bool {
        scaled := uint(math.Round(in * 1000))
        return float64(scaled)/1000 == in
}

// putCustomBound writes a custom bound to the bstream. It stores values from
// 0 to 33554.430 (inclusive) that are multiples of 0.001 in unsigned varbit
// encoding of up to 4 bytes, but needs 1 bit + 8 bytes for other values like
// negative numbers, numbers greater than 33554.430, or numbers that are not
// a multiple of 0.001, on the assumption that they are less common. In detail:
//   - Multiply the bound by 1000, without rounding.
//   - If the multiplied bound is >= 0, <= 33554430 and a whole number,
//     add 1 and store it in unsigned varbit encoding. All these numbers are
//     greater than 0, so the leading bit of the varbit is always 1!
//   - Otherwise, store a 0 bit, followed by the 8 bytes of the original
//     bound as a float64.
//
// When reading the values, we can first decode a value as unsigned varbit,
// if it's 0, then we read the next 8 bytes as a float64, otherwise
// we can convert the value to a float64 by subtracting 1 and dividing by 1000.
func putCustomBound(b *bstream, f float64) {
        scaled := f * 1000

        // Compact form: 33554430 is one less than the largest value a varbit
        // can hold in 4 bytes, leaving room for the +1 that keeps the encoded
        // value non-zero. (A NaN fails the range comparisons and therefore
        // takes the raw path below.)
        if scaled >= 0 && scaled <= 33554430 && isWholeWhenMultiplied(f) {
                putVarbitUint(b, uint64(math.Round(scaled))+1)
                return
        }

        // Raw form: a single zero bit followed by the float64 bits.
        b.writeBit(zero)
        b.writeBits(math.Float64bits(f), 64)
}

// readCustomBound decodes a custom bound encoded by putCustomBound. A
// non-zero varbit value v stands for (v-1)/1000; a zero value means that the
// next 64 bits hold the raw float64.
func readCustomBound(br *bstreamReader) (float64, error) {
        encoded, err := readVarbitUint(br)
        if err != nil {
                return 0, err
        }
        if encoded != 0 {
                return float64(encoded-1) / 1000, nil
        }

        raw, err := br.readBits(64)
        if err != nil {
                return 0, err
        }
        return math.Float64frombits(raw), nil
}

// bucketIterator walks over the buckets described by a list of spans and
// yields the global index of each bucket in turn (see Next). It is created
// with newBucketIterator.
type bucketIterator struct {
        spans  []histogram.Span
        span   int // Span position of last yielded bucket.
        bucket int // Bucket position within span of last yielded bucket.
        idx    int // Bucket index (globally across all spans) of last yielded bucket.
}

// newBucketIterator returns a bucketIterator positioned just before the first
// bucket of spans. If there is at least one span, the global index starts out
// shifted by the first span's offset so that the first call to Next yields
// that offset.
func newBucketIterator(spans []histogram.Span) *bucketIterator {
        it := &bucketIterator{spans: spans, bucket: -1, idx: -1}
        if len(spans) > 0 {
                it.idx += int(spans[0].Offset)
        }
        return it
}

// Next advances to the next bucket and returns its global index and true. It
// returns 0 and false once no further bucket exists. Spans of length zero
// contribute their offset to the index but yield no bucket.
func (b *bucketIterator) Next() (int, bool) {
        last := len(b.spans) - 1

        // Already past the final span.
        if b.span > last {
                return 0, false
        }

        // More buckets left in the current span?
        if cur := b.spans[b.span]; b.bucket < int(cur.Length)-1 {
                b.bucket++
                b.idx++
                return b.idx, true
        }

        // Otherwise look for the next span that actually contains a bucket.
        for b.span < last {
                b.span++
                next := b.spans[b.span]
                b.bucket = 0
                b.idx += int(next.Offset + 1)
                if next.Length > 0 {
                        return b.idx, true
                }
                // Empty span: undo the step onto its (nonexistent) first bucket.
                b.idx--
        }

        // Nothing left to yield.
        return 0, false
}

// An Insert describes how many new buckets have to be inserted before
// processing the pos'th bucket from the original slice.
type Insert struct {
        // pos is the index into the original bucket slice before which the
        // new buckets are inserted.
        pos int
        // num is the number of new buckets to insert at that position.
        num int
}

// expandSpansForward returns the inserts to expand the bucket spans 'a' so that
// they match the spans in 'b'. 'b' must cover the same or more buckets than
// 'a', otherwise the function will return false.
//
// Example:
//
// Let's say the old buckets look like this:
//
//        span syntax: [offset, length]
//        spans      : [ 0 , 2 ]               [2,1]                   [ 3 , 2 ]                     [3,1]       [1,1]
//        bucket idx : [0]   [1]    2     3    [4]    5     6     7    [8]   [9]    10    11    12   [13]   14   [15]
//        raw values    6     3                 3                       2     4                       5           1
//        deltas        6    -3                 0                      -1     2                       1          -4
//
// But now we introduce a new bucket layout. (Carefully chosen example where we
// have a span appended, one unchanged[*], one prepended, and two merge - in
// that order.)
//
// [*] Unchanged in terms of which bucket indices they represent. But to achieve
// that, their offset needs to change if "disrupted" by spans changing ahead of
// them.
//
//                                              \/ this one is "unchanged"
//        spans      : [  0  ,  3    ]         [1,1]       [    1    ,   4     ]                     [  3  ,   3    ]
//        bucket idx : [0]   [1]   [2]    3    [4]    5    [6]   [7]   [8]   [9]    10    11    12   [13]  [14]  [15]
//        raw values    6     3     0           3           0     0     2     4                       5     0     1
//        deltas        6    -3    -3           3          -3     0     2     2                       1    -5     1
//        delta mods:                          / \                     / \                                       / \
//
// Note for histograms with delta-encoded buckets: Whenever any new buckets are
// introduced, the subsequent "old" bucket needs to readjust its delta to the
// new base of 0. Thus, for the caller who wants to transform the set of
// original deltas to a new set of deltas to match a new span layout that adds
// buckets, we simply need to generate a list of inserts.
//
// Note: Within expandSpansForward we don't have to worry about the changes to the
// spans themselves, thanks to the iterators we get to work with the more useful
// bucket indices (which of course directly correspond to the buckets we have to
// adjust).
func expandSpansForward(a, b []histogram.Span) (forward []Insert, ok bool) {
        var (
                iterA = newBucketIterator(a)
                iterB = newBucketIterator(b)

                result []Insert
                // pending collects a streak of buckets that exist only in 'b'.
                // It is a valid insert once pending.num > 0 and is emitted
                // when the streak ends.
                pending Insert
        )

        idxA, moreA := iterA.Next()
        idxB, moreB := iterB.Next()
        for moreA || moreB {
                // 'a' has a bucket that 'b' lacks: 'b' does not cover 'a'.
                if !moreB || (moreA && idxA < idxB) {
                        return result, false
                }

                // 'b' has a bucket that 'a' lacks: extend the streak and
                // advance 'b' only.
                if !moreA || idxA > idxB {
                        pending.num++
                        idxB, moreB = iterB.Next()
                        continue
                }

                // Same bucket in both: emit the finished streak (if any),
                // then advance both sides.
                if pending.num > 0 {
                        result = append(result, pending)
                }
                pending.num = 0
                idxA, moreA = iterA.Next()
                idxB, moreB = iterB.Next()
                pending.pos++
        }

        // Both iterators are exhausted; emit a trailing streak.
        if pending.num > 0 {
                result = append(result, pending)
        }
        return result, true
}

// expandSpansBothWays works like expandSpansForward, except that 'b' may
// cover a completely different set of buckets than 'a'. It returns the
// "forward" inserts that expand 'a' to also cover the buckets found only in
// 'b', the "backward" inserts that expand 'b' to also cover the buckets found
// only in 'a', and the spans describing the union of both bucket sets.
func expandSpansBothWays(a, b []histogram.Span) (forward, backward []Insert, mergedSpans []histogram.Span) {
        iterA := newBucketIterator(a)
        iterB := newBucketIterator(b)

        // appendMerged records bucket index idx in mergedSpans, either by
        // extending the last span (adjacent bucket) or by opening a new one.
        var prevIdx int
        appendMerged := func(idx int) {
                gap := idx - prevIdx - 1
                switch {
                case len(mergedSpans) == 0:
                        // The very first span's offset is the bucket index itself.
                        mergedSpans = append(mergedSpans, histogram.Span{
                                Offset: int32(gap + 1),
                                Length: 1,
                        })
                case gap == 0:
                        mergedSpans[len(mergedSpans)-1].Length++
                default:
                        mergedSpans = append(mergedSpans, histogram.Span{
                                Offset: int32(gap),
                                Length: 1,
                        })
                }
                prevIdx = idx
        }

        // fwdPending / bwdPending collect streaks of buckets missing in 'a'
        // or 'b', respectively. Each is a valid insert once its num is > 0 and
        // is emitted when its streak ends.
        var fwdPending, bwdPending Insert
        flushForward := func() {
                if fwdPending.num > 0 {
                        forward = append(forward, fwdPending)
                        fwdPending.num = 0
                }
        }
        flushBackward := func() {
                if bwdPending.num > 0 {
                        backward = append(backward, bwdPending)
                        bwdPending.num = 0
                }
        }

        idxA, moreA := iterA.Next()
        idxB, moreB := iterB.Next()
        for moreA || moreB {
                switch {
                case moreA && moreB && idxA == idxB:
                        // Same bucket on both sides: both streaks end here.
                        flushForward()
                        flushBackward()
                        appendMerged(idxA)
                        idxA, moreA = iterA.Next()
                        idxB, moreB = iterB.Next()
                        fwdPending.pos++
                        bwdPending.pos++
                case moreA && moreB && idxA < idxB:
                        // Bucket only in 'a'. Emit the forward streak before
                        // the position in 'a' moves on.
                        bwdPending.num++
                        flushForward()
                        appendMerged(idxA)
                        fwdPending.pos++
                        idxA, moreA = iterA.Next()
                case moreA && moreB:
                        // idxA > idxB: bucket only in 'b'. Emit the backward
                        // streak before the position in 'b' moves on.
                        fwdPending.num++
                        flushBackward()
                        appendMerged(idxB)
                        bwdPending.pos++
                        idxB, moreB = iterB.Next()
                case moreA:
                        // 'b' is exhausted; the rest of 'a' is missing in 'b'.
                        bwdPending.num++
                        appendMerged(idxA)
                        idxA, moreA = iterA.Next()
                default:
                        // 'a' is exhausted; the rest of 'b' is missing in 'a'.
                        fwdPending.num++
                        appendMerged(idxB)
                        idxB, moreB = iterB.Next()
                }
        }

        // Both iterators are exhausted; emit trailing streaks.
        flushForward()
        flushBackward()

        return forward, backward, mergedSpans
}

// bucketValue is the type constraint for bucket values handled by generic
// helpers: either int64 or float64.
type bucketValue interface {
        int64 | float64
}

// insert copies the values of 'in' into 'out', adding zero-valued buckets at
// the positions described by 'inserts'. 'out' must already be long enough to
// hold all input values plus all inserted buckets; it is returned for
// convenience.
//
// With deltas==true the values are treated as deltas to the previous bucket,
// so the first inserted bucket gets the negated running value (bringing the
// absolute value to zero) and the input value following the insert is
// adjusted to undo that. With deltas==false the values are absolute and the
// inserted buckets are simply set to zero.
//
// At most one insert may remain once the input is exhausted (an insert at the
// very end); otherwise the function panics.
func insert[BV bucketValue](in, out []BV, inserts []Insert, deltas bool) []BV {
        var (
                outPos  int // Next index to write in out.
                nextIns int // Index of the next insert to apply.
                last    BV  // Running sum of the input values consumed so far.
        )

        // fillEmpty writes the buckets of inserts[nextIns] into out. The
        // first bucket is always written; in delta mode it cancels out the
        // running value, all further buckets are plain zeros.
        fillEmpty := func() {
                first := BV(0)
                if deltas {
                        first = -last
                }
                out[outPos] = first
                outPos++
                for n := inserts[nextIns].num; n > 1; n-- {
                        out[outPos] = 0
                        outPos++
                }
        }

        for i, d := range in {
                pending := nextIns < len(inserts) && inserts[nextIns].pos == i
                if pending {
                        fillEmpty()
                        nextIns++
                }
                if pending && deltas {
                        // Undo the delta that was introduced by the insert
                        // by adding back the value seen before it.
                        out[outPos] = d + last
                } else {
                        // Either no insert happened here, or the values are
                        // absolute: the input value is still valid as is.
                        out[outPos] = d
                }
                outPos++
                last += d
        }

        switch remaining := len(inserts) - nextIns; remaining {
        case 0:
                // All inserts processed. Nothing more to do.
        case 1:
                // One more insert to process at the end.
                fillEmpty()
        default:
                panic("unprocessed inserts left")
        }
        return out
}

// counterResetHint derives the CounterResetHint to report for a histogram
// from the chunk's CounterResetHeader and from how many histograms have been
// read from the chunk so far.
func counterResetHint(crh CounterResetHeader, numRead uint16) histogram.CounterResetHint {
        // A gauge histogram chunk only contains gauge histograms.
        if crh == GaugeType {
                return histogram.GaugeType
        }
        // In a counter histogram chunk, there will not be any counter resets
        // after the first histogram.
        if numRead > 1 {
                return histogram.NotCounterReset
        }
        // If the chunk was started because of a counter reset, we can safely
        // return that hint. This histogram always has to be treated as a
        // counter reset.
        if crh == CounterReset {
                return histogram.CounterReset
        }
        // Sadly, we have to return "unknown" as the hint for all other cases,
        // even if we know that the chunk was started without a counter reset.
        // But we cannot be sure that the previous chunk still exists in the
        // TSDB, so we conservatively return "unknown". On the bright side,
        // this case should be relatively rare.
        //
        // TODO(beorn7): Nevertheless, if the current chunk is in the middle of
        // a block (not the first chunk in the block for this series), it's
        // probably safe to assume that the previous chunk will exist in the
        // TSDB for as long as the current chunk exists, and we could safely
        // return "histogram.NotCounterReset". This needs some more work and
        // might not be worth the effort and/or risk. To be vetted...
        return histogram.UnknownCounterReset
}

// nextNonEmptySpanSliceIdx advances idx to the next span with a non-zero
// length, handling the pathological case of empty spans in between. The
// offsets of skipped empty spans are accumulated into the bucket index. It
// returns the index of the span found and the bucket index of that span's
// first bucket. If no further non-empty span exists, the returned index is
// not a valid index into spans and the returned bucket index is 0.
// Call it with idx==-1 to find the first non-empty span.
func nextNonEmptySpanSliceIdx(idx int, bucketIdx int32, spans []histogram.Span) (newIdx int, newBucketIdx int32) {
        idx++
        // Skip over empty spans, still accounting for their offsets.
        for idx < len(spans) && spans[idx].Length == 0 {
                bucketIdx += spans[idx].Offset
                idx++
        }
        if idx < len(spans) {
                return idx, bucketIdx + spans[idx].Offset + 1
        }
        return idx, 0
}

// Copyright 2021 The Prometheus Authors
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package chunkenc

import (
        "fmt"
        "math/bits"
)

// putVarbitInt writes an int64 using varbit encoding with a bit bucketing
// optimized for the dod's observed in histogram buckets, plus a few additional
// buckets for large numbers.
//
// The encoding is a prefix of 1-bits terminated by a 0-bit (or eight 1-bits
// for the largest bucket) that selects the bucket, followed by the value
// truncated to the bucket's number of bits. A value of exactly 0 is encoded
// as a single 0-bit.
//
// For optimal space utilization, each branch wouldn't need to support any
// values of any of the prior branches. So we could expand the range of each
// branch. Do more with fewer bits. It would come at the price of more
// expensive encoding and decoding (cutting out and later adding back that
// center-piece we skip). With the distributions of values we see in practice,
// we would reduce the size by around 1%. A more detailed study would be needed
// for precise values, but it appears quite certain that we would end up far
// below 10%, which would maybe convince us to invest the increased
// coding/decoding cost.
func putVarbitInt(b *bstream, val int64) {
        if val == 0 { // Precisely 0, needs 1 bit.
                b.writeBit(zero)
                return
        }

        // Pick the bucket first, then emit prefix and payload in one place.
        var (
                prefix     uint64
                prefixLen  int
                payloadLen int
        )
        switch {
        case bitRange(val, 3): // -3 <= val <= 4, needs 5 bits.
                prefix, prefixLen, payloadLen = 0b10, 2, 3
        case bitRange(val, 6): // -31 <= val <= 32, 9 bits.
                prefix, prefixLen, payloadLen = 0b110, 3, 6
        case bitRange(val, 9): // -255 <= val <= 256, 13 bits.
                prefix, prefixLen, payloadLen = 0b1110, 4, 9
        case bitRange(val, 12): // -2047 <= val <= 2048, 17 bits.
                prefix, prefixLen, payloadLen = 0b11110, 5, 12
        case bitRange(val, 18): // -131071 <= val <= 131072, 3 bytes.
                prefix, prefixLen, payloadLen = 0b111110, 6, 18
        case bitRange(val, 25): // -16777215 <= val <= 16777216, 4 bytes.
                prefix, prefixLen, payloadLen = 0b1111110, 7, 25
        case bitRange(val, 56): // -36028797018963967 <= val <= 36028797018963968, 8 bytes.
                prefix, prefixLen, payloadLen = 0b11111110, 8, 56
        default: // Worst case, needs 9 bytes.
                prefix, prefixLen, payloadLen = 0b11111111, 8, 64
        }
        b.writeBits(prefix, prefixLen)
        b.writeBits(uint64(val), payloadLen)
}

// readVarbitInt reads an int64 encoded with putVarbitInt. It returns any error
// encountered while reading from b, or an error if the bucket prefix is not a
// known bit pattern.
func readVarbitInt(b *bstreamReader) (int64, error) {
        // Read the bucket prefix: up to eight 1-bits, terminated early by the
        // first 0-bit.
        var prefix byte
        for n := 0; n < 8; n++ {
                prefix <<= 1
                bit, err := b.readBitFast()
                if err != nil {
                        // Fall back to the slow path before giving up.
                        if bit, err = b.readBit(); err != nil {
                                return 0, err
                        }
                }
                if bit == zero {
                        break
                }
                prefix |= 1
        }

        // Map the prefix to the number of payload bits.
        var payloadLen uint8
        switch prefix {
        case 0b0:
                // val == 0, no payload follows.
                return 0, nil
        case 0b10:
                payloadLen = 3
        case 0b110:
                payloadLen = 6
        case 0b1110:
                payloadLen = 9
        case 0b11110:
                payloadLen = 12
        case 0b111110:
                payloadLen = 18
        case 0b1111110:
                payloadLen = 25
        case 0b11111110:
                payloadLen = 56
        case 0b11111111:
                // Do not use fast because it's very unlikely it will succeed.
                raw, err := b.readBits(64)
                if err != nil {
                        return 0, err
                }
                return int64(raw), nil
        default:
                return 0, fmt.Errorf("invalid bit pattern %b", prefix)
        }

        raw, err := b.readBitsFast(payloadLen)
        if err != nil {
                if raw, err = b.readBits(payloadLen); err != nil {
                        return 0, err
                }
        }
        // Values above the positive end of the bucket's range are negative
        // numbers that came back as unsigned; shift them down by 2^payloadLen
        // so that the int64 conversion yields the negative value.
        if raw > (1 << (payloadLen - 1)) {
                raw -= 1 << payloadLen
        }
        return int64(raw), nil
}

// bitRangeUint returns whether the unsigned integer x can be represented by
// nbits bits, i.e. whether it has at least 64-nbits leading zero bits.
func bitRangeUint(x uint64, nbits int) bool {
        requiredZeros := 64 - nbits
        return requiredZeros <= bits.LeadingZeros64(x)
}

// putVarbitUint writes a uint64 using varbit encoding. It uses the same bit
// buckets as putVarbitInt: a bucket-selecting prefix followed by the value in
// the bucket's number of bits, with 0 encoded as a single 0-bit.
func putVarbitUint(b *bstream, val uint64) {
        if val == 0 { // Precisely 0, needs 1 bit.
                b.writeBit(zero)
                return
        }

        // Pick the bucket first, then emit prefix and payload in one place.
        var (
                prefix     uint64
                prefixLen  int
                payloadLen int
        )
        switch {
        case bitRangeUint(val, 3): // val <= 7, needs 5 bits.
                prefix, prefixLen, payloadLen = 0b10, 2, 3
        case bitRangeUint(val, 6): // val <= 63, 9 bits.
                prefix, prefixLen, payloadLen = 0b110, 3, 6
        case bitRangeUint(val, 9): // val <= 511, 13 bits.
                prefix, prefixLen, payloadLen = 0b1110, 4, 9
        case bitRangeUint(val, 12): // val <= 4095, 17 bits.
                prefix, prefixLen, payloadLen = 0b11110, 5, 12
        case bitRangeUint(val, 18): // val <= 262143, 3 bytes.
                prefix, prefixLen, payloadLen = 0b111110, 6, 18
        case bitRangeUint(val, 25): // val <= 33554431, 4 bytes.
                prefix, prefixLen, payloadLen = 0b1111110, 7, 25
        case bitRangeUint(val, 56): // val <= 72057594037927935, 8 bytes.
                prefix, prefixLen, payloadLen = 0b11111110, 8, 56
        default: // Worst case, needs 9 bytes.
                prefix, prefixLen, payloadLen = 0b11111111, 8, 64
        }
        b.writeBits(prefix, prefixLen)
        b.writeBits(val, payloadLen)
}

// readVarbitUint reads a uint64 encoded with putVarbitUint. It returns any
// error encountered while reading from b, or an error if the bucket prefix is
// not a known bit pattern.
func readVarbitUint(b *bstreamReader) (uint64, error) {
        // Read the bucket prefix: up to eight 1-bits, terminated early by the
        // first 0-bit.
        var prefix byte
        for n := 0; n < 8; n++ {
                prefix <<= 1
                bit, err := b.readBitFast()
                if err != nil {
                        // Fall back to the slow path before giving up.
                        if bit, err = b.readBit(); err != nil {
                                return 0, err
                        }
                }
                if bit == zero {
                        break
                }
                prefix |= 1
        }

        // Map the prefix to the number of payload bits.
        var payloadLen uint8
        switch prefix {
        case 0b0:
                // val == 0, no payload follows.
                return 0, nil
        case 0b10:
                payloadLen = 3
        case 0b110:
                payloadLen = 6
        case 0b1110:
                payloadLen = 9
        case 0b11110:
                payloadLen = 12
        case 0b111110:
                payloadLen = 18
        case 0b1111110:
                payloadLen = 25
        case 0b11111110:
                payloadLen = 56
        case 0b11111111:
                // Do not use fast because it's very unlikely it will succeed.
                raw, err := b.readBits(64)
                if err != nil {
                        return 0, err
                }
                return raw, nil
        default:
                return 0, fmt.Errorf("invalid bit pattern %b", prefix)
        }

        raw, err := b.readBitsFast(payloadLen)
        if err != nil {
                if raw, err = b.readBits(payloadLen); err != nil {
                        return 0, err
                }
        }
        return raw, nil
}

// Copyright 2017 The Prometheus Authors
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// The code in this file was largely written by Damian Gryski as part of
// https://github.com/dgryski/go-tsz and published under the license below.
// It was modified to accommodate reading from byte slices without modifying
// the underlying bytes, which would panic when reading from mmap'd
// read-only byte slices.

// Copyright (c) 2015,2016 Damian Gryski <damian@gryski.com>
// All rights reserved.

// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are met:

// * Redistributions of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistributions in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
// ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
// WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
// DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
// FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
// DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
// SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

package chunkenc

import (
        "encoding/binary"
        "math"
        "math/bits"

        "github.com/prometheus/prometheus/model/histogram"
)

const (
        // chunkCompactCapacityThreshold is the number of spare bytes of
        // capacity (beyond the current length) a chunk's byte stream must
        // exceed before XORChunk.Compact reallocates it to its exact length.
        chunkCompactCapacityThreshold = 32
)

// XORChunk holds XOR encoded sample data.
type XORChunk struct {
        // b is the bit stream holding the chunk data. Its first 2 bytes are a
        // header storing the number of samples as a big-endian uint16; the
        // encoded samples follow.
        b bstream
}

// NewXORChunk returns a new, empty chunk with XOR encoding. The underlying
// byte stream starts out with the 2-byte header (sample count zero) and a
// capacity of 128 bytes.
func NewXORChunk() *XORChunk {
        c := &XORChunk{}
        c.b.stream = make([]byte, 2, 128)
        c.b.count = 0
        return c
}

// Reset re-initializes the chunk's bit stream with the given byte slice by
// delegating to bstream.Reset.
func (c *XORChunk) Reset(stream []byte) {
        c.b.Reset(stream)
}

// Encoding returns the encoding type of the chunk, which is always EncXOR.
func (c *XORChunk) Encoding() Encoding {
        return EncXOR
}

// Bytes returns the underlying byte slice of the chunk, including the 2-byte
// header that stores the number of samples.
func (c *XORChunk) Bytes() []byte {
        return c.b.bytes()
}

// NumSamples returns the number of samples in the chunk, as recorded in the
// big-endian uint16 header at the start of the chunk's bytes.
func (c *XORChunk) NumSamples() int {
        header := c.b.bytes()
        count := binary.BigEndian.Uint16(header)
        return int(count)
}

// Compact implements the Chunk interface. If the byte stream has more than
// chunkCompactCapacityThreshold bytes of unused capacity, it is copied into a
// freshly allocated slice of exactly its current length.
func (c *XORChunk) Compact() {
        used := len(c.b.stream)
        if cap(c.b.stream) <= used+chunkCompactCapacityThreshold {
                // Not enough slack to be worth a reallocation.
                return
        }
        compacted := make([]byte, used)
        copy(compacted, c.b.stream)
        c.b.stream = compacted
}

// Appender implements the Chunk interface.
// It is not valid to call Appender() multiple times concurrently or to use multiple
// Appenders on the same chunk.
//
// The returned appender continues where the existing data ends. An error is
// returned if the existing chunk data cannot be iterated.
func (c *XORChunk) Appender() (Appender, error) {
        // To get an appender we must know the state it would have if we had
        // appended all existing data from scratch. So we run an iterator to
        // the end of the chunk and take over its state.
        it := c.iterator(nil)
        for {
                if it.Next() == ValNone {
                        break
                }
        }
        if err := it.Err(); err != nil {
                return nil, err
        }

        leading := it.leading
        if it.numTotal == 0 {
                // For an empty chunk, 0xff marks that there are no previous
                // leading/trailing values that xorWrite could reuse.
                leading = 0xff
        }
        return &xorAppender{
                b:        &c.b,
                t:        it.t,
                v:        it.val,
                tDelta:   it.tDelta,
                leading:  leading,
                trailing: it.trailing,
        }, nil
}

// iterator returns an xorIterator positioned at the start of the chunk. If
// the passed-in iterator is an *xorIterator, it is reset and reused;
// otherwise a new one is allocated.
func (c *XORChunk) iterator(it Iterator) *xorIterator {
        reusable, ok := it.(*xorIterator)
        if !ok {
                return &xorIterator{
                        // The first 2 bytes contain chunk headers.
                        // We skip that for actual samples.
                        br:       newBReader(c.b.bytes()[2:]),
                        numTotal: binary.BigEndian.Uint16(c.b.bytes()),
                        t:        math.MinInt64,
                }
        }
        reusable.Reset(c.b.bytes())
        return reusable
}

// Iterator implements the Chunk interface.
// Iterator() must not be called concurrently with any modifications to the chunk,
// but after it returns you can use an Iterator concurrently with an Appender or
// other Iterators.
//
// If the passed-in Iterator is an *xorIterator, it is reset and reused
// instead of allocating a new one.
func (c *XORChunk) Iterator(it Iterator) Iterator {
        return c.iterator(it)
}

// xorAppender appends float samples to the bit stream of an XORChunk.
type xorAppender struct {
        b *bstream // The chunk's bit stream, including the 2-byte sample count header.

        t      int64   // Timestamp of the last appended sample.
        v      float64 // Value of the last appended sample.
        tDelta uint64  // Timestamp delta between the last two appended samples.

        // Leading and trailing zero bit counts shared with xorWrite. A leading
        // value of 0xff means there are no previous values to reuse.
        leading  uint8
        trailing uint8
}

// Append adds the sample (t, v) to the chunk and increments the sample count
// stored in the chunk header.
//
// The first sample is stored as a varint timestamp plus the raw 64 value
// bits. The second sample stores the timestamp delta as a uvarint. All later
// samples store the delta of the timestamp delta (dod) in one of several bit
// buckets. From the second sample on, the value is XOR-encoded against the
// previous value.
func (a *xorAppender) Append(t int64, v float64) {
        num := binary.BigEndian.Uint16(a.b.bytes())

        var tDelta uint64
        if num == 0 {
                var scratch [binary.MaxVarintLen64]byte
                for _, bt := range scratch[:binary.PutVarint(scratch[:], t)] {
                        a.b.writeByte(bt)
                }
                a.b.writeBits(math.Float64bits(v), 64)
        } else {
                tDelta = uint64(t - a.t)

                if num == 1 {
                        var scratch [binary.MaxVarintLen64]byte
                        for _, bt := range scratch[:binary.PutUvarint(scratch[:], tDelta)] {
                                a.b.writeByte(bt)
                        }
                } else {
                        dod := int64(tDelta - a.tDelta)

                        // Gorilla has a max resolution of seconds, Prometheus milliseconds.
                        // Thus we use higher value range steps with larger bit size.
                        //
                        // TODO(beorn7): This seems to needlessly jump to large bit
                        // sizes even for very small deviations from zero. Timestamp
                        // compression can probably benefit from some smaller bit
                        // buckets. See also what was done for histogram encoding in
                        // varbit.go.
                        if dod == 0 {
                                a.b.writeBit(zero)
                        } else {
                                // Default to the catch-all bucket and narrow
                                // it down if the dod fits a smaller one.
                                prefix, prefixLen, payloadLen := uint64(0b1111), 4, 64
                                switch {
                                case bitRange(dod, 14):
                                        prefix, prefixLen, payloadLen = 0b10, 2, 14
                                case bitRange(dod, 17):
                                        prefix, prefixLen, payloadLen = 0b110, 3, 17
                                case bitRange(dod, 20):
                                        prefix, prefixLen, payloadLen = 0b1110, 4, 20
                                }
                                a.b.writeBits(prefix, prefixLen)
                                a.b.writeBits(uint64(dod), payloadLen)
                        }
                }

                a.writeVDelta(v)
        }

        // Remember the state needed to encode the next sample and bump the
        // sample count in the header.
        a.t = t
        a.v = v
        binary.BigEndian.PutUint16(a.b.bytes(), num+1)
        a.tDelta = tDelta
}

// bitRange returns whether the given integer can be represented by nbits.
// The accepted range is -(2^(nbits-1)-1) <= x <= 2^(nbits-1).
// See docs/bstream.md.
func bitRange(x int64, nbits uint8) bool {
        upper := int64(1) << (nbits - 1)
        lower := -(upper - 1)
        return x >= lower && x <= upper
}

// writeVDelta XOR-encodes v against the previously appended value a.v and
// writes the result to the bit stream via xorWrite, which may update
// a.leading and a.trailing.
func (a *xorAppender) writeVDelta(v float64) {
        xorWrite(a.b, v, a.v, &a.leading, &a.trailing)
}

// AppendHistogram always panics: histogram samples cannot be appended to a
// float chunk.
func (a *xorAppender) AppendHistogram(*HistogramAppender, int64, *histogram.Histogram, bool) (Chunk, bool, Appender, error) {
        panic("appended a histogram sample to a float chunk")
}

// AppendFloatHistogram always panics: float histogram samples cannot be
// appended to a float chunk.
func (a *xorAppender) AppendFloatHistogram(*FloatHistogramAppender, int64, *histogram.FloatHistogram, bool) (Chunk, bool, Appender, error) {
        panic("appended a float histogram sample to a float chunk")
}

// xorIterator iterates over the float samples of an XOR encoded chunk.
type xorIterator struct {
        br       bstreamReader // Reader over the chunk bytes following the 2-byte header.
        numTotal uint16        // Number of samples in the chunk, taken from the header.
        numRead  uint16        // Number of samples read so far.

        t   int64   // Timestamp of the current sample.
        val float64 // Value of the current sample.

        // Leading and trailing zero bit counts carried between value reads
        // (see xorRead).
        leading  uint8
        trailing uint8

        tDelta uint64 // Timestamp delta between the last two samples read.
        err    error  // First error encountered; once set, iteration stops.
}

// Seek advances the iterator to the first sample with a timestamp greater
// than or equal to t. If no sample has been read yet, at least one is read.
// It returns ValFloat on success and ValNone if the iterator is in an error
// state or runs out of samples first. If the current sample already
// satisfies the condition, the iterator does not advance.
func (it *xorIterator) Seek(t int64) ValueType {
        if it.err != nil {
                return ValNone
        }

        for it.numRead == 0 || it.t < t {
                if it.Next() == ValNone {
                        return ValNone
                }
        }
        return ValFloat
}

// At returns the timestamp and value of the current sample.
func (it *xorIterator) At() (int64, float64) {
        return it.t, it.val
}

// AtHistogram always panics: an xorIterator only yields float samples.
func (it *xorIterator) AtHistogram(*histogram.Histogram) (int64, *histogram.Histogram) {
        panic("cannot call xorIterator.AtHistogram")
}

// AtFloatHistogram always panics: an xorIterator only yields float samples.
func (it *xorIterator) AtFloatHistogram(*histogram.FloatHistogram) (int64, *histogram.FloatHistogram) {
        panic("cannot call xorIterator.AtFloatHistogram")
}

// AtT returns the timestamp of the current sample.
func (it *xorIterator) AtT() int64 {
        return it.t
}

// Err returns the error that stopped the iteration, or nil if none occurred.
func (it *xorIterator) Err() error {
        return it.err
}

// Reset re-initializes the iterator to read the chunk bytes in b from the
// start. All iteration state (samples read, current sample, leading/trailing
// bit counts, timestamp delta and error) is cleared to its zero value.
//
// NOTE: the timestamp is reset to 0 here, whereas XORChunk.iterator
// initializes a freshly allocated iterator with math.MinInt64.
func (it *xorIterator) Reset(b []byte) {
        *it = xorIterator{
                // The first 2 bytes contain chunk headers.
                // We skip that for actual samples.
                br:       newBReader(b[2:]),
                numTotal: binary.BigEndian.Uint16(b),
        }
}

// Next advances the iterator to the next sample. It returns ValFloat if a
// sample was read, and ValNone if the iterator is in an error state, all
// samples have been read, or reading fails (in which case the error is
// available via Err).
//
// The decoding mirrors xorAppender.Append: the first sample is a varint
// timestamp plus 64 raw value bits, the second a uvarint timestamp delta plus
// an XOR-encoded value, and every later sample a bucketed delta-of-delta plus
// an XOR-encoded value.
func (it *xorIterator) Next() ValueType {
        if it.err != nil || it.numRead == it.numTotal {
                return ValNone
        }

        // fail records the error and ends the iteration.
        fail := func(err error) ValueType {
                it.err = err
                return ValNone
        }

        switch it.numRead {
        case 0:
                t, err := binary.ReadVarint(&it.br)
                if err != nil {
                        return fail(err)
                }
                v, err := it.br.readBits(64)
                if err != nil {
                        return fail(err)
                }
                it.t, it.val = t, math.Float64frombits(v)

                it.numRead++
                return ValFloat
        case 1:
                tDelta, err := binary.ReadUvarint(&it.br)
                if err != nil {
                        return fail(err)
                }
                it.tDelta = tDelta
                it.t += int64(it.tDelta)

                return it.readValue()
        }

        // Read the delta-of-delta bucket prefix: up to four 1-bits,
        // terminated early by the first 0-bit.
        var prefix byte
        for n := 0; n < 4; n++ {
                prefix <<= 1
                bit, err := it.br.readBitFast()
                if err != nil {
                        if bit, err = it.br.readBit(); err != nil {
                                return fail(err)
                        }
                }
                if bit == zero {
                        break
                }
                prefix |= 1
        }

        var (
                dod        int64
                payloadLen uint8
        )
        switch prefix {
        case 0b0:
                // dod == 0
        case 0b10:
                payloadLen = 14
        case 0b110:
                payloadLen = 17
        case 0b1110:
                payloadLen = 20
        case 0b1111:
                // Do not use fast because it's very unlikely it will succeed.
                raw, err := it.br.readBits(64)
                if err != nil {
                        return fail(err)
                }
                dod = int64(raw)
        }

        if payloadLen != 0 {
                raw, err := it.br.readBitsFast(payloadLen)
                if err != nil {
                        if raw, err = it.br.readBits(payloadLen); err != nil {
                                return fail(err)
                        }
                }

                // Account for negative numbers, which come back as high unsigned numbers.
                // See docs/bstream.md.
                if raw > (1 << (payloadLen - 1)) {
                        raw -= 1 << payloadLen
                }
                dod = int64(raw)
        }

        it.tDelta = uint64(int64(it.tDelta) + dod)
        it.t += int64(it.tDelta)

        return it.readValue()
}

// readValue decodes the next XOR-encoded value into it.val. On success it
// counts the sample as read and returns ValFloat; on failure it stores the
// error and returns ValNone.
func (it *xorIterator) readValue() ValueType {
        if err := xorRead(&it.br, &it.val, &it.leading, &it.trailing); err != nil {
                it.err = err
                return ValNone
        }
        it.numRead++
        return ValFloat
}

// xorWrite writes newValue to b, encoded as the XOR of its bits with those of
// currentValue.
//
// An unchanged value is written as a single 0-bit. Otherwise a 1-bit is
// followed by either
//   - a 0-bit and the significant bits within the window given by *leading
//     and *trailing, if the XOR fits into that window, or
//   - a 1-bit, 5 bits of leading zero count, 6 bits of significant bit count
//     and the significant bits themselves, in which case *leading and
//     *trailing are updated to the new window.
//
// A *leading value of 0xff means there is no previous window to reuse.
func xorWrite(b *bstream, newValue, currentValue float64, leading, trailing *uint8) {
        xor := math.Float64bits(newValue) ^ math.Float64bits(currentValue)
        if xor == 0 {
                b.writeBit(zero)
                return
        }
        b.writeBit(one)

        lead := uint8(bits.LeadingZeros64(xor))
        trail := uint8(bits.TrailingZeros64(xor))

        // Clamp number of leading zeros to avoid overflow when encoding (the
        // count is stored in 5 bits).
        if lead > 31 {
                lead = 31
        }

        fitsWindow := *leading != 0xff && lead >= *leading && trail >= *trailing
        if !fitsWindow {
                // Update leading/trailing for the caller.
                *leading, *trailing = lead, trail

                b.writeBit(one)
                b.writeBits(uint64(lead), 5)

                // Note that if lead == trail == 0, then significant == 64.
                // But that value doesn't actually fit into the 6 bits we
                // have. Luckily, we never need to encode 0 significant bits,
                // since that would put us in the other case (xor == 0). So
                // instead we write out a 0 and adjust it back to 64 on
                // unpacking.
                significant := 64 - lead - trail
                b.writeBits(uint64(significant), 6)
                b.writeBits(xor>>trail, int(significant))
                return
        }

        // In this case, we stick with the current leading/trailing.
        b.writeBit(zero)
        b.writeBits(xor>>*trailing, 64-int(*leading)-int(*trailing))
}

// xorRead reads one value encoded by xorWrite from br and applies it to
// *value, which must hold the previous value on entry.
//
// If the encoding signals an unchanged value, *value is left as is. If it
// carries a new leading/trailing window, *leading and *trailing are updated
// for the caller; otherwise the current window is reused. Any read error is
// returned and leaves *value, *leading and *trailing untouched.
func xorRead(br *bstreamReader, value *float64, leading, trailing *uint8) error {
        // read fetches n bits, trying the fast path first.
        read := func(n uint8) (uint64, error) {
                v, err := br.readBitsFast(n)
                if err != nil {
                        v, err = br.readBits(n)
                }
                return v, err
        }

        changed, err := br.readBitFast()
        if err != nil {
                changed, err = br.readBit()
        }
        if err != nil {
                return err
        }
        if changed == zero {
                // Same value as before.
                return nil
        }

        newWindow, err := br.readBitFast()
        if err != nil {
                newWindow, err = br.readBit()
        }
        if err != nil {
                return err
        }

        // Start out with the current window; it is reused as is unless the
        // stream carries a new one.
        lead, trail := *leading, *trailing
        significant := 64 - lead - trail
        if newWindow != zero {
                raw, err := read(5)
                if err != nil {
                        return err
                }
                lead = uint8(raw)

                raw, err = read(6)
                if err != nil {
                        return err
                }
                significant = uint8(raw)
                // 0 significant bits here means we overflowed and we actually
                // need 64; see comment in xorWrite.
                if significant == 0 {
                        significant = 64
                }
                trail = 64 - lead - significant
                // Update leading/trailing zero bits for the caller.
                *leading, *trailing = lead, trail
        }

        payload, err := read(significant)
        if err != nil {
                return err
        }
        *value = math.Float64frombits(math.Float64bits(*value) ^ (payload << trail))
        return nil
}

// Copyright 2021 The Prometheus Authors
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package chunks

import (
        "errors"
        "sync"
        "time"

        "github.com/prometheus/client_golang/prometheus"

        "github.com/prometheus/prometheus/tsdb/chunkenc"
)

const (
        // Minimum recorded peak since the last shrinking of chunkWriteQueue.chunkRefMap to shrink it again.
        chunkRefMapShrinkThreshold = 1000

        // Minimum interval between shrinking of chunkWriteQueue.chunkRefMap.
        chunkRefMapMinShrinkInterval = 10 * time.Minute

        // Maximum size of segment used by job queue (number of elements). With chunkWriteJob being 64 bytes,
        // this will use ~512 KiB for empty queue. newChunkWriteQueue caps the segment size at this value.
        maxChunkQueueSegmentSize = 8192
)

// chunkWriteJob describes a single chunk to be written by the
// chunkWriteQueue. Except for callback, all fields are passed on as arguments
// to the queue's writeChunk function when the job is processed.
type chunkWriteJob struct {
        cutFile   bool
        seriesRef HeadSeriesRef
        mint      int64
        maxt      int64
        chk       chunkenc.Chunk
        ref       ChunkDiskMapperRef
        isOOO     bool
        // NOTE(review): presumably invoked with the result of the write once
        // the job has been processed — confirm in processJob.
        callback func(error)
}

// chunkWriteQueue is a queue for writing chunks to disk in a non-blocking fashion.
// Chunks that shall be written get added to the queue, which is consumed asynchronously.
// Adding jobs to the queue is non-blocking as long as the queue isn't full.
type chunkWriteQueue struct {
        // jobs holds the pending write jobs; a single worker goroutine started
        // in start() pops and processes them.
        jobs *writeJobQueue

        chunkRefMapMtx        sync.RWMutex
        chunkRefMap           map[ChunkDiskMapperRef]chunkenc.Chunk
        chunkRefMapPeakSize   int       // Largest size that chunkRefMap has grown to since the last time we shrank it.
        chunkRefMapLastShrink time.Time // When the chunkRefMap has been shrunk the last time.

        // isRunningMtx serves two purposes:
        // 1. It protects isRunning field.
        // 2. It serializes adding of jobs to the chunkRefMap in addJob() method. If the jobs queue is full then addJob() will block
        // while holding this mutex, which guarantees that chunkRefMap won't ever grow beyond the queue size + 1.
        isRunningMtx sync.Mutex
        isRunning    bool // Used to prevent that new jobs get added to the queue when the queue is already closed.

        // workerWg tracks the worker goroutine started in start().
        workerWg sync.WaitGroup

        // writeChunk is the function used to actually write a chunk.
        writeChunk writeChunkF

        // Keeping separate counters instead of only a single CounterVec to improve the performance of the critical
        // addJob() method which otherwise would need to perform a WithLabelValues call on the CounterVec.
        adds      prometheus.Counter
        gets      prometheus.Counter
        completed prometheus.Counter
        shrink    prometheus.Counter
}

// writeChunkF is a function which writes chunks, it is dynamic to allow mocking in tests.
// The queue calls it with a job's fields in this order:
// seriesRef, mint, maxt, chk, ref, isOOO, cutFile.
type writeChunkF func(HeadSeriesRef, int64, int64, chunkenc.Chunk, ChunkDiskMapperRef, bool, bool) error

// newChunkWriteQueue creates a chunkWriteQueue holding at most size jobs which
// uses writeChunk to perform the writes, and starts its worker before returning.
// The operation counters are registered with reg unless reg is nil.
func newChunkWriteQueue(reg prometheus.Registerer, size int, writeChunk writeChunkF) *chunkWriteQueue {
        opCounters := prometheus.NewCounterVec(
                prometheus.CounterOpts{
                        Name: "prometheus_tsdb_chunk_write_queue_operations_total",
                        Help: "Number of operations on the chunk_write_queue.",
                },
                []string{"operation"},
        )

        // A segment never needs to be larger than the queue itself, and is
        // capped at maxChunkQueueSegmentSize.
        segmentSize := maxChunkQueueSegmentSize
        if size < segmentSize {
                segmentSize = size
        }

        queue := &chunkWriteQueue{}
        queue.jobs = newWriteJobQueue(size, segmentSize)
        queue.chunkRefMap = make(map[ChunkDiskMapperRef]chunkenc.Chunk)
        queue.chunkRefMapLastShrink = time.Now()
        queue.writeChunk = writeChunk

        // Resolve the per-operation counters once so the hot paths only call Inc().
        queue.adds = opCounters.WithLabelValues("add")
        queue.gets = opCounters.WithLabelValues("get")
        queue.completed = opCounters.WithLabelValues("complete")
        queue.shrink = opCounters.WithLabelValues("shrink")

        if reg != nil {
                reg.MustRegister(opCounters)
        }

        queue.start()
        return queue
}

// start launches the worker goroutine, which processes jobs until pop()
// reports that no more jobs will arrive, and then marks the queue as running.
func (c *chunkWriteQueue) start() {
        c.workerWg.Add(1)
        go func() {
                defer c.workerWg.Done()

                for job, ok := c.jobs.pop(); ok; job, ok = c.jobs.pop() {
                        c.processJob(job)
                }
        }()

        c.isRunningMtx.Lock()
        defer c.isRunningMtx.Unlock()
        c.isRunning = true
}

// processJob writes the chunk of the given job, reports the outcome to the
// job's callback (if any) and then removes the chunk from chunkRefMap.
func (c *chunkWriteQueue) processJob(job chunkWriteJob) {
        writeErr := c.writeChunk(job.seriesRef, job.mint, job.maxt, job.chk, job.ref, job.isOOO, job.cutFile)
        if notify := job.callback; notify != nil {
                notify(writeErr)
        }

        c.chunkRefMapMtx.Lock()
        defer c.chunkRefMapMtx.Unlock()

        // The job counts as completed regardless of whether the write succeeded.
        delete(c.chunkRefMap, job.ref)
        c.completed.Inc()

        // The map may be empty now, which is the only state in which it can be shrunk.
        c.shrinkChunkRefMap()
}

// shrinkChunkRefMap replaces chunkRefMap with a freshly allocated, smaller map
// if all shrink conditions are met. The caller must hold chunkRefMapMtx.
//
// The Go runtime does not give back the memory used internally by a map once
// the map has been emptied, so the old map is dropped in favour of a new one.
func (c *chunkWriteQueue) shrinkChunkRefMap() {
        switch {
        case len(c.chunkRefMap) > 0:
                // A map that still holds data can't be replaced.
                return
        case c.chunkRefMapPeakSize < chunkRefMapShrinkThreshold:
                // The map never grew enough to make shrinking worthwhile.
                return
        }

        now := time.Now()
        if sinceLastShrink := now.Sub(c.chunkRefMapLastShrink); sinceLastShrink < chunkRefMapMinShrinkInterval {
                // The previous shrink happened too recently.
                return
        }

        // Size the new map to half of the peak reached since the last shrink.
        // This is a compromise between starting very small (many allocations
        // while it grows again) and starting large (memory that stays unused).
        newSize := c.chunkRefMapPeakSize / 2
        c.chunkRefMap = make(map[ChunkDiskMapperRef]chunkenc.Chunk, newSize)

        c.chunkRefMapLastShrink = now
        c.chunkRefMapPeakSize = 0
        c.shrink.Inc()
}

// addJob registers the job's chunk in chunkRefMap and pushes the job onto the
// queue. It returns an error if the queue is not running or if the push is
// rejected; in the latter case the chunk is removed from chunkRefMap again.
func (c *chunkWriteQueue) addJob(job chunkWriteJob) (err error) {
        // Deferred first so that it runs last, after isRunningMtx has been released.
        defer func() {
                if err != nil {
                        return
                }
                c.adds.Inc()
        }()

        c.isRunningMtx.Lock()
        defer c.isRunningMtx.Unlock()

        if !c.isRunning {
                return errors.New("queue is not running")
        }

        c.chunkRefMapMtx.Lock()
        c.chunkRefMap[job.ref] = job.chk
        // Remember the largest size the map has reached.
        if size := len(c.chunkRefMap); size > c.chunkRefMapPeakSize {
                c.chunkRefMapPeakSize = size
        }
        c.chunkRefMapMtx.Unlock()

        if c.jobs.push(job) {
                return nil
        }

        // The push was rejected, undo the registration made above.
        c.chunkRefMapMtx.Lock()
        delete(c.chunkRefMap, job.ref)
        c.chunkRefMapMtx.Unlock()

        return errors.New("queue is closed")
}

// get returns the chunk registered under ref, or nil if chunkRefMap has no
// entry for it. Only successful lookups are counted.
func (c *chunkWriteQueue) get(ref ChunkDiskMapperRef) chunkenc.Chunk {
        c.chunkRefMapMtx.RLock()
        defer c.chunkRefMapMtx.RUnlock()

        chk, found := c.chunkRefMap[ref]
        if !found {
                return chk
        }

        c.gets.Inc()
        return chk
}

// stop marks the queue as not running, closes the job queue and waits for the
// worker goroutine to return. Calling it on a stopped queue is a no-op.
func (c *chunkWriteQueue) stop() {
        c.isRunningMtx.Lock()
        defer c.isRunningMtx.Unlock()

        if c.isRunning {
                c.isRunning = false
                c.jobs.close()
                c.workerWg.Wait()
        }
}

// queueIsEmpty reports whether there is no job left that is queued or still being processed.
func (c *chunkWriteQueue) queueIsEmpty() bool {
        return c.queueSize() == 0
}

// queueIsFull reports whether the queue holds its maximum number of jobs while
// one more job is being processed.
func (c *chunkWriteQueue) queueIsFull() bool {
        // When the queue is full and blocked on the writer the chunkRefMap has one more job than the max size of
        // the jobs queue because one job is currently being processed and blocked in the writer.
        return c.queueSize() == c.jobs.maxSize+1
}

// queueSize returns the number of jobs that have been added but not yet fully processed.
func (c *chunkWriteQueue) queueSize() int {
        c.chunkRefMapMtx.Lock()
        defer c.chunkRefMapMtx.Unlock()

        // Looking at chunkRefMap instead of the jobs queue because a job is popped from the queue before it has
        // been fully processed, it remains in the chunkRefMap until the processing is complete.
        return len(c.chunkRefMap)
}

// Copyright 2017 The Prometheus Authors
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package chunks

import (
        "bufio"
        "encoding/binary"
        "fmt"
        "hash"
        "hash/crc32"
        "io"
        "os"
        "path/filepath"
        "strconv"

        "github.com/prometheus/prometheus/tsdb/chunkenc"
        tsdb_errors "github.com/prometheus/prometheus/tsdb/errors"
        "github.com/prometheus/prometheus/tsdb/fileutil"
)

// Segment header fields constants.
const (
        // MagicChunks is 4 bytes at the head of a series file.
        MagicChunks = 0x85BD40DD
        // MagicChunksSize is the size in bytes of MagicChunks.
        MagicChunksSize          = 4
        // chunksFormatV1 is the chunks format version; newReader accepts only this version.
        chunksFormatV1           = 1
        // ChunksFormatVersionSize is the size in bytes of the format version field.
        ChunksFormatVersionSize  = 1
        // segmentHeaderPaddingSize is the number of padding bytes that complete the header.
        segmentHeaderPaddingSize = 3
        // SegmentHeaderSize defines the total size of the header part.
        SegmentHeaderSize = MagicChunksSize + ChunksFormatVersionSize + segmentHeaderPaddingSize
)

// Chunk fields constants.
const (
        // MaxChunkLengthFieldSize defines the maximum size of the data length part,
        // which is stored as an unsigned varint.
        MaxChunkLengthFieldSize = binary.MaxVarintLen32
        // ChunkEncodingSize defines the size in bytes of the chunk encoding part.
        ChunkEncodingSize = 1
)

// ChunkRef is a generic reference for reading chunk data. In Prometheus it
// is either a HeadChunkRef or a BlockChunkRef, though other implementations
// may have their own reference types.
type ChunkRef uint64

// HeadSeriesRef refers to an in-memory series.
// It must fit into 5 bytes to be packable into a HeadChunkRef.
type HeadSeriesRef uint64

// HeadChunkRef packs a HeadSeriesRef and a HeadChunkID into a global 8 byte ID.
// The HeadSeriesRef occupies the upper 5 bytes and the HeadChunkID the lower
// 3 bytes, so neither may exceed that size.
type HeadChunkRef uint64

// NewHeadChunkRef packs hsr into the upper 5 bytes and chunkID into the lower
// 3 bytes of a HeadChunkRef. It panics if either value does not fit.
func NewHeadChunkRef(hsr HeadSeriesRef, chunkID HeadChunkID) HeadChunkRef {
        const (
                maxSeriesRef = 1<<40 - 1 // Largest value that fits into 5 bytes.
                maxChunkID   = 1<<24 - 1 // Largest value that fits into 3 bytes.
        )

        switch {
        case hsr > maxSeriesRef:
                panic("series ID exceeds 5 bytes")
        case chunkID > maxChunkID:
                panic("chunk ID exceeds 3 bytes")
        }

        packed := uint64(hsr)<<24 | uint64(chunkID)
        return HeadChunkRef(packed)
}

// Unpack splits the reference into its HeadSeriesRef (upper 5 bytes) and
// HeadChunkID (lower 3 bytes).
func (p HeadChunkRef) Unpack() (HeadSeriesRef, HeadChunkID) {
        const chunkIDMask = 1<<24 - 1

        seriesRef := HeadSeriesRef(p >> 24)
        chunkID := HeadChunkID(p) & chunkIDMask
        return seriesRef, chunkID
}

// HeadChunkID refers to a specific chunk in a series (memSeries) in the Head.
// Each memSeries has its own monotonically increasing number to refer to its chunks.
// If the HeadChunkID value is...
//   - memSeries.firstChunkID+len(memSeries.mmappedChunks), it's the head chunk.
//   - less than the above, but >= memSeries.firstChunkID, then it's
//     memSeries.mmappedChunks[i] where i = HeadChunkID - memSeries.firstChunkID.
//
// If memSeries.headChunks is non-nil it points to a *memChunk that holds the current
// "open" (accepting appends) instance. *memChunk is a linked list and the memChunk.next pointer
// might link to the older *memChunk instance.
// If there are multiple *memChunk instances linked to each other from memSeries.headChunks
// they will be m-mapped as soon as possible, leaving only the "open" *memChunk instance.
//
// Example:
// assume a memSeries.firstChunkID=7 and memSeries.mmappedChunks=[p5,p6,p7,p8,p9].
//
//        | HeadChunkID value | refers to ...                                                                          |
//        |-------------------|----------------------------------------------------------------------------------------|
//        |               0-6 | chunks that have been compacted to blocks, these won't return data for queries in Head |
//        |              7-11 | memSeries.mmappedChunks[i] where i is 0 to 4.                                          |
//        |                12 |                                                         *memChunk{next: nil}           |
//        |                13 |                                         *memChunk{next: ^}                             |
//        |                14 | memSeries.headChunks -> *memChunk{next: ^}                                             |
type HeadChunkID uint64

// BlockChunkRef refers to a chunk within a persisted block.
// The upper 4 bytes hold the segment index and the lower 4 bytes hold the
// offset within that segment at which the data for this chunk starts.
type BlockChunkRef uint64

// NewBlockChunkRef builds a BlockChunkRef with fileIndex in the upper 4 bytes
// and fileOffset in the lower 4 bytes. The arguments are not range-checked.
func NewBlockChunkRef(fileIndex, fileOffset uint64) BlockChunkRef {
        packed := (fileIndex << 32) | fileOffset
        return BlockChunkRef(packed)
}

// Unpack returns the segment index (upper 4 bytes) and the offset of the chunk
// within that segment (lower 4 bytes).
func (b BlockChunkRef) Unpack() (int, int) {
        const offsetMask = 1<<32 - 1

        return int(b >> 32), int(b & offsetMask)
}

// Meta holds information about one or more chunks.
// For examples of when chunks.Meta could refer to multiple chunks, see
// ChunkReader.ChunkOrIterable().
type Meta struct {
        // Ref and Chunk hold either a reference that can be used to retrieve
        // chunk data or the data itself.
        // If Chunk is nil, call ChunkReader.ChunkOrIterable(Meta.Ref) to get the
        // chunk and assign it to the Chunk field. If an iterable is returned from
        // that method, then it may not be possible to set Chunk as the iterable
        // might form several chunks.
        Ref   ChunkRef
        Chunk chunkenc.Chunk

        // Time range the data covers, both ends inclusive.
        // When MaxTime == math.MaxInt64 the chunk is still open and being appended to.
        MinTime, MaxTime int64

        // OOOLastRef, OOOLastMinTime and OOOLastMaxTime are kept as markers for
        // overlapping chunks.
        // These fields point to the last created out-of-order Chunk (the head) that existed
        // when Series() was called and was overlapping.
        // Series() and Chunk() method responses should be consistent for the same
        // query even if new data is added in between the calls.
        OOOLastRef                     ChunkRef
        OOOLastMinTime, OOOLastMaxTime int64
}

// ChunkFromSamples builds a single chunk from s via ChunkFromSamplesGeneric.
// It requires all samples to have the same type.
func ChunkFromSamples(s []Sample) (Meta, error) {
        return ChunkFromSamplesGeneric(SampleSlice(s))
}

// ChunkFromSamplesGeneric encodes all samples of s into a single chunk and
// returns a Meta holding that chunk together with the timestamps of the first
// and the last sample as MinTime and MaxTime.
//
// All samples are required to have the same type; the type of the first sample
// selects the chunk encoding. For an empty s, a Meta with an empty XOR chunk
// and a zero time range is returned.
//
// An error is returned if the chunk or its appender cannot be created, if
// appending a histogram fails, or if appending a histogram would require
// starting a second chunk. It panics on an unknown sample type.
func ChunkFromSamplesGeneric(s Samples) (Meta, error) {
        emptyChunk := Meta{Chunk: chunkenc.NewXORChunk()}
        if s.Len() == 0 {
                return emptyChunk, nil
        }

        mint, maxt := s.Get(0).T(), s.Get(s.Len()-1).T()

        sampleType := s.Get(0).Type()
        c, err := chunkenc.NewEmptyChunk(sampleType.ChunkEncoding())
        if err != nil {
                return Meta{}, err
        }

        // Surface a failing appender here instead of dereferencing a nil
        // appender in the loop below.
        ca, err := c.Appender()
        if err != nil {
                return Meta{}, err
        }

        var newChunk chunkenc.Chunk
        for i := 0; i < s.Len(); i++ {
                smpl := s.Get(i)
                switch sampleType {
                case chunkenc.ValFloat:
                        ca.Append(smpl.T(), smpl.F())
                case chunkenc.ValHistogram:
                        newChunk, _, ca, err = ca.AppendHistogram(nil, smpl.T(), smpl.H(), false)
                        if err != nil {
                                return emptyChunk, err
                        }
                        // Everything has to fit into the one chunk that is returned.
                        if newChunk != nil {
                                return emptyChunk, fmt.Errorf("did not expect to start a second chunk")
                        }
                case chunkenc.ValFloatHistogram:
                        newChunk, _, ca, err = ca.AppendFloatHistogram(nil, smpl.T(), smpl.FH(), false)
                        if err != nil {
                                return emptyChunk, err
                        }
                        if newChunk != nil {
                                return emptyChunk, fmt.Errorf("did not expect to start a second chunk")
                        }
                default:
                        panic(fmt.Sprintf("unknown sample type %s", sampleType.String()))
                }
        }
        return Meta{
                MinTime: mint,
                MaxTime: maxt,
                Chunk:   c,
        }, nil
}

// ChunkMetasToSamples decodes every chunk referenced by chunks, in order, and
// returns all contained samples as one slice. The result is nil when there is
// nothing to decode. It panics on an unexpected value type.
// Used in tests to compare the content of chunks.
func ChunkMetasToSamples(chunks []Meta) (result []Sample) {
        for _, meta := range chunks {
                it := meta.Chunk.Iterator(nil)
                for {
                        vt := it.Next()
                        if vt == chunkenc.ValNone {
                                break
                        }

                        switch vt {
                        case chunkenc.ValFloat:
                                ts, val := it.At()
                                result = append(result, sample{t: ts, f: val})
                        case chunkenc.ValHistogram:
                                ts, hist := it.AtHistogram(nil)
                                result = append(result, sample{t: ts, h: hist})
                        case chunkenc.ValFloatHistogram:
                                ts, floatHist := it.AtFloatHistogram(nil)
                                result = append(result, sample{t: ts, fh: floatHist})
                        default:
                                panic("unexpected value type")
                        }
                }
        }
        return result
}

// Iterator iterates over the chunks of a single time series.
type Iterator interface {
        // At returns the current meta.
        // It depends on the implementation whether the chunk is populated or not.
        At() Meta
        // Next advances the iterator by one.
        // It returns false when the iteration has ended.
        Next() bool
        // Err returns the error, if any, after Next has returned false.
        Err() error
}

// writeHash feeds the encoding byte of the chunk followed by its raw data into h.
// buf is used as scratch space for the encoding byte.
func (cm *Meta) writeHash(h hash.Hash, buf []byte) error {
        encoding := byte(cm.Chunk.Encoding())
        buf = append(buf[:0], encoding)
        if _, err := h.Write(buf[:1]); err != nil {
                return err
        }

        _, err := h.Write(cm.Chunk.Bytes())
        return err
}

// OverlapsClosedInterval returns true if the chunk overlaps [mint, maxt].
func (cm *Meta) OverlapsClosedInterval(mint, maxt int64) bool {
        // The chunk itself is the closed interval [cm.MinTime, cm.MaxTime]. Two
        // closed intervals overlap unless one of them starts after the other ends.
        startsAfter := cm.MinTime > maxt
        endsBefore := cm.MaxTime < mint
        return !(startsAfter || endsBefore)
}

// errInvalidSize is wrapped by newReader when a segment is too short to hold the segment header.
var errInvalidSize = fmt.Errorf("invalid size")

// castagnoliTable is the CRC32 table for the Castagnoli polynomial. It is
// built once at package initialization and shared by newCRC32 and checkCRC32.
var castagnoliTable = crc32.MakeTable(crc32.Castagnoli)

// newCRC32 initializes a CRC32 hash with a preconfigured polynomial (Castagnoli),
// so the polynomial may be easily changed in one location at a later time, if necessary.
func newCRC32() hash.Hash32 {
        return crc32.New(castagnoliTable)
}

// checkCRC32 verifies that the Castagnoli CRC32 of data equals the checksum
// stored big-endian in the first 4 bytes of sum, which was computed when the
// chunk was stored. It returns an error describing both values on a mismatch.
func checkCRC32(data, sum []byte) error {
        got := crc32.Checksum(data, castagnoliTable)

        // Decode sum as a big-endian uint32, i.e. the inverse of digest.Sum() in
        // go/src/hash/crc32.
        want := uint32(sum[0])<<24 | uint32(sum[1])<<16 | uint32(sum[2])<<8 | uint32(sum[3])

        if got == want {
                return nil
        }
        return fmt.Errorf("checksum mismatch expected:%x, actual:%x", want, got)
}

// Writer implements the ChunkWriter interface for the standard
// serialization format.
type Writer struct {
        // dirFile is the open directory in which the segment files are created.
        dirFile *os.File
        // files are the segment files created by this writer; the last one is the current tail.
        files   []*os.File
        // wbuf buffers the writes to the current tail file.
        wbuf    *bufio.Writer
        // n is the number of bytes in the current segment, including its header.
        // It is 0 as long as no segment has been created.
        n       int64
        crc32   hash.Hash
        // buf is scratch space for encoding the chunk length, the encoding byte and the checksum.
        buf     [binary.MaxVarintLen32]byte

        // segmentSize is the size limit of a segment file in bytes.
        segmentSize int64
}

const (
        // DefaultChunkSegmentSize is the default chunks segment size (512 MiB).
        DefaultChunkSegmentSize = 512 * 1024 * 1024
)

// NewWriterWithSegSize returns a new writer against the given directory
// and allows setting a custom size for the segments.
// A segmentSize <= 0 selects DefaultChunkSegmentSize.
func NewWriterWithSegSize(dir string, segmentSize int64) (*Writer, error) {
        return newWriter(dir, segmentSize)
}

// NewWriter returns a new writer against the given directory
// using the default segment size (DefaultChunkSegmentSize).
func NewWriter(dir string) (*Writer, error) {
        return newWriter(dir, DefaultChunkSegmentSize)
}

// newWriter creates dir if necessary, opens it and returns a Writer for it.
// A segmentSize <= 0 is replaced by DefaultChunkSegmentSize. No segment file
// is created yet.
func newWriter(dir string, segmentSize int64) (*Writer, error) {
        if segmentSize <= 0 {
                segmentSize = DefaultChunkSegmentSize
        }

        if err := os.MkdirAll(dir, 0o777); err != nil {
                return nil, err
        }

        dirHandle, err := fileutil.OpenDir(dir)
        if err != nil {
                return nil, err
        }

        // The zero value of n marks that no segment has been cut so far.
        w := &Writer{
                dirFile:     dirHandle,
                crc32:       newCRC32(),
                segmentSize: segmentSize,
        }
        return w, nil
}

// tail returns the most recently created segment file, or nil if no segment
// has been created yet.
func (w *Writer) tail() *os.File {
        if last := len(w.files) - 1; last >= 0 {
                return w.files[last]
        }
        return nil
}

// finalizeTail flushes all buffered data to the current tail file, syncs it,
// truncates it to the current write position and closes it.
// It does nothing when there is no tail file.
func (w *Writer) finalizeTail() error {
        tailFile := w.tail()
        if tailFile == nil {
                return nil
        }

        // Hand the buffered bytes to the file and sync it before it is resized.
        if err := w.wbuf.Flush(); err != nil {
                return err
        }
        if err := tailFile.Sync(); err != nil {
                return err
        }

        // The file was pre-allocated, so cut off the superfluous zero bytes
        // behind the current position.
        end, err := tailFile.Seek(0, io.SeekCurrent)
        if err != nil {
                return err
        }
        if err = tailFile.Truncate(end); err != nil {
                return err
        }

        return tailFile.Close()
}

// cut finalizes the current tail segment, creates the next segment file and
// directs the buffered writer to it. Afterwards w.n equals the header size of
// the new segment.
func (w *Writer) cut() error {
        // Sync the current tail to disk and close it.
        if err := w.finalizeTail(); err != nil {
                return err
        }

        headerSize, segFile, _, err := cutSegmentFile(w.dirFile, MagicChunks, chunksFormatV1, w.segmentSize)
        if err != nil {
                return err
        }

        w.n = int64(headerSize)
        w.files = append(w.files, segFile)

        // The buffered writer is allocated for the first segment and reused afterwards.
        if w.wbuf == nil {
                w.wbuf = bufio.NewWriterSize(segFile, 8*1024*1024)
                return nil
        }
        w.wbuf.Reset(segFile)
        return nil
}

// cutSegmentFile creates the next sequentially numbered segment file in the
// directory referred to by dirFile and writes the segment header (magicNumber,
// chunksFormat and zero padding) to it.
//
// The file is first created under a ".tmp" name, pre-allocated to allocSize
// bytes if allocSize > 0, and then renamed to its final name.
//
// It returns the number of header bytes written, the final file opened for
// writing and positioned right behind the header, and the sequence number of
// the file. On error, the temporary file is closed (if still open) and removed,
// and any resulting errors are combined with the returned error.
func cutSegmentFile(dirFile *os.File, magicNumber uint32, chunksFormat byte, allocSize int64) (headerSize int, newFile *os.File, seq int, returnErr error) {
        p, seq, err := nextSequenceFile(dirFile.Name())
        if err != nil {
                return 0, nil, 0, fmt.Errorf("next sequence file: %w", err)
        }
        ptmp := p + ".tmp"
        f, err := os.OpenFile(ptmp, os.O_WRONLY|os.O_CREATE, 0o666)
        if err != nil {
                return 0, nil, 0, fmt.Errorf("open temp file: %w", err)
        }
        // Cleanup for all error returns below; it reads the named result returnErr.
        defer func() {
                if returnErr != nil {
                        errs := tsdb_errors.NewMulti(returnErr)
                        // f is nil when the file has been closed already.
                        if f != nil {
                                errs.Add(f.Close())
                        }
                        // Calling RemoveAll on a non-existent file does not return error.
                        errs.Add(os.RemoveAll(ptmp))
                        returnErr = errs.Err()
                }
        }()
        if allocSize > 0 {
                if err = fileutil.Preallocate(f, allocSize, true); err != nil {
                        return 0, nil, 0, fmt.Errorf("preallocate: %w", err)
                }
        }
        if err = dirFile.Sync(); err != nil {
                return 0, nil, 0, fmt.Errorf("sync directory: %w", err)
        }

        // Write header metadata for new file.
        // The bytes behind the format version stay zero and act as padding.
        metab := make([]byte, SegmentHeaderSize)
        binary.BigEndian.PutUint32(metab[:MagicChunksSize], magicNumber)
        metab[4] = chunksFormat

        n, err := f.Write(metab)
        if err != nil {
                return 0, nil, 0, fmt.Errorf("write header: %w", err)
        }
        if err := f.Close(); err != nil {
                return 0, nil, 0, fmt.Errorf("close temp file: %w", err)
        }
        // Prevent the deferred cleanup from closing the file a second time.
        f = nil

        if err := fileutil.Rename(ptmp, p); err != nil {
                return 0, nil, 0, fmt.Errorf("replace file: %w", err)
        }

        f, err = os.OpenFile(p, os.O_WRONLY, 0o666)
        if err != nil {
                return 0, nil, 0, fmt.Errorf("open final file: %w", err)
        }
        // Skip header for further writes.
        if _, err := f.Seek(int64(n), 0); err != nil {
                return 0, nil, 0, fmt.Errorf("seek in final file: %w", err)
        }
        return n, f, seq, nil
}

// write appends b to the buffered writer of the current segment and adds the
// number of bytes actually written to w.n, even if the write fails part-way.
func (w *Writer) write(b []byte) error {
        n, err := w.wbuf.Write(b)
        w.n += int64(n)
        return err
}

// WriteChunks writes as many chunks as possible to the current segment,
// cuts a new segment when the current segment is full and
// writes the rest of the chunks in the new segment.
//
// The chunks are first split into batches, each of which is meant to fit into
// one segment, and then the batches are written one segment after another.
// The Ref field of every written element of chks is set to the location
// the chunk was written to.
func (w *Writer) WriteChunks(chks ...Meta) error {
        var (
                batchSize  = int64(0)
                batchStart = 0
                batches    = make([][]Meta, 1)
                batchID    = 0
                firstBatch = true
        )

        for i, chk := range chks {
                // Each chunk contains: data length + encoding + the data itself + crc32
                chkSize := int64(MaxChunkLengthFieldSize) // The data length is a variable length field so use the maximum possible value.
                chkSize += ChunkEncodingSize              // The chunk encoding.
                chkSize += int64(len(chk.Chunk.Bytes()))  // The data itself.
                chkSize += crc32.Size                     // The 4 bytes of crc32.
                batchSize += chkSize

                // Cut a new batch when it is not the first chunk(to avoid empty segments) and
                // the batch is too large to fit in the current segment.
                cutNewBatch := (i != 0) && (batchSize+SegmentHeaderSize > w.segmentSize)

                // If the segment already has some data then
                // the first batch size calculation should account for that.
                if firstBatch && w.n > SegmentHeaderSize {
                        cutNewBatch = batchSize+w.n > w.segmentSize
                        if cutNewBatch {
                                firstBatch = false
                        }
                }

                if cutNewBatch {
                        // The current chunk becomes the first chunk of the new batch.
                        batchStart = i
                        batches = append(batches, []Meta{})
                        batchID++
                        batchSize = chkSize
                }
                // Extend the current batch to include the current chunk.
                batches[batchID] = chks[batchStart : i+1]
        }

        // Create a new segment when one doesn't already exist.
        if w.n == 0 {
                if err := w.cut(); err != nil {
                        return err
                }
        }

        for i, chks := range batches {
                if err := w.writeChunks(chks); err != nil {
                        return err
                }
                // Cut a new segment only when there are more chunks to write.
                // Avoid creating a new empty segment at the end of the write.
                if i < len(batches)-1 {
                        if err := w.cut(); err != nil {
                                return err
                        }
                }
        }
        return nil
}

// writeChunks writes the chunks into the current segment irrespective
// of the configured segment size limit. A segment should have been already
// started before calling this.
//
// Each chunk is written as: data length (uvarint), encoding (1 byte), data,
// CRC32 over encoding and data. Before a chunk is written, its Ref is set to
// the current segment index and the offset at which the chunk starts.
func (w *Writer) writeChunks(chks []Meta) error {
        if len(chks) == 0 {
                return nil
        }

        seq := uint64(w.seq())
        for i := range chks {
                chk := &chks[i]

                chk.Ref = ChunkRef(NewBlockChunkRef(seq, uint64(w.n)))

                // w.buf is reused as scratch space for every field below, so the
                // order of the following steps matters.
                n := binary.PutUvarint(w.buf[:], uint64(len(chk.Chunk.Bytes())))

                if err := w.write(w.buf[:n]); err != nil {
                        return err
                }
                w.buf[0] = byte(chk.Chunk.Encoding())
                if err := w.write(w.buf[:1]); err != nil {
                        return err
                }
                if err := w.write(chk.Chunk.Bytes()); err != nil {
                        return err
                }

                // The checksum covers the encoding byte and the chunk data.
                w.crc32.Reset()
                if err := chk.writeHash(w.crc32, w.buf[:]); err != nil {
                        return err
                }
                if err := w.write(w.crc32.Sum(w.buf[:0])); err != nil {
                        return err
                }
        }
        return nil
}

// seq returns the index of the current tail segment within w.files,
// or -1 if no segment has been created yet.
func (w *Writer) seq() int {
        return len(w.files) - 1
}

// Close finalizes the current tail segment (flush, sync, truncate, close) and
// closes the directory handle.
//
// The directory handle is closed even when finalizing the tail fails, so that
// a failing Close does not leak it. If finalizing fails, that error is
// returned; if closing the directory fails as well, its message is appended.
func (w *Writer) Close() error {
        finalizeErr := w.finalizeTail()

        // close dir file (if not windows platform will fail on rename)
        closeErr := w.dirFile.Close()

        if finalizeErr == nil {
                return closeErr
        }
        if closeErr != nil {
                // Keep the finalize error as the wrapped cause so that callers
                // can still match it with errors.Is / errors.As.
                return fmt.Errorf("%w (closing the directory also failed: %v)", finalizeErr, closeErr)
        }
        return finalizeErr
}

// ByteSlice abstracts a byte slice.
type ByteSlice interface {
        // Len returns the number of bytes in the slice.
        Len() int
        // Range returns the bytes from start to end.
        // NOTE(review): realByteSlice treats end as exclusive (b[start:end]);
        // other implementations are presumably expected to do the same.
        Range(start, end int) []byte
}

// realByteSlice is a ByteSlice backed directly by a []byte.
type realByteSlice []byte

// Len returns the length of the underlying slice.
func (b realByteSlice) Len() int {
        return len(b)
}

// Range returns b[start:end], which shares memory with b.
func (b realByteSlice) Range(start, end int) []byte {
        return b[start:end]
}

// Reader implements a ChunkReader for a serialized byte stream
// of series data.
type Reader struct {
        // The underlying bytes holding the encoded series data.
        // Each slice holds the data for a different segment and is indexed
        // by the segment index of a BlockChunkRef.
        bs   []ByteSlice
        cs   []io.Closer // Closers for resources behind the byte slices.
        size int64       // The total size of bytes in the reader.
        pool chunkenc.Pool
}

// newReader returns a Reader over the given segments after validating the
// header of every segment: its length, magic number and format version.
// The sizes of all segments are summed up as the size of the reader.
func newReader(bs []ByteSlice, cs []io.Closer, pool chunkenc.Pool) (*Reader, error) {
        cr := &Reader{pool: pool, bs: bs, cs: cs}

        for i, segment := range cr.bs {
                if segment.Len() < SegmentHeaderSize {
                        return nil, fmt.Errorf("invalid segment header in segment %d: %w", i, errInvalidSize)
                }

                // Verify magic number.
                magic := binary.BigEndian.Uint32(segment.Range(0, MagicChunksSize))
                if magic != MagicChunks {
                        return nil, fmt.Errorf("invalid magic number %x", magic)
                }

                // Verify chunk format version.
                versionField := segment.Range(MagicChunksSize, MagicChunksSize+ChunksFormatVersionSize)
                if version := int(versionField[0]); version != chunksFormatV1 {
                        return nil, fmt.Errorf("invalid chunk format version %d", version)
                }

                cr.size += int64(segment.Len())
        }
        return cr, nil
}

// NewDirReader returns a new Reader against sequentially numbered files in the
// given directory. Every file is memory-mapped. A nil pool is replaced by a
// new chunkenc pool. If mapping a file or validating the segments fails, all
// files mapped so far are closed again.
func NewDirReader(dir string, pool chunkenc.Pool) (*Reader, error) {
        segmentPaths, err := sequenceFiles(dir)
        if err != nil {
                return nil, err
        }
        if pool == nil {
                pool = chunkenc.NewPool()
        }

        var (
                segments []ByteSlice
                closers  []io.Closer
        )

        // failWith closes everything opened so far and combines the outcome with cause.
        failWith := func(cause error) error {
                return tsdb_errors.NewMulti(
                        cause,
                        tsdb_errors.CloseAll(closers),
                ).Err()
        }

        for _, segmentPath := range segmentPaths {
                mapped, err := fileutil.OpenMmapFile(segmentPath)
                if err != nil {
                        return nil, failWith(fmt.Errorf("mmap files: %w", err))
                }
                closers = append(closers, mapped)
                segments = append(segments, realByteSlice(mapped.Bytes()))
        }

        reader, err := newReader(segments, closers, pool)
        if err != nil {
                return nil, failWith(err)
        }
        return reader, nil
}

// Close closes all resources behind the segments of the reader.
func (s *Reader) Close() error {
        return tsdb_errors.CloseAll(s.cs)
}

// Size returns the size of the chunks, i.e. the summed length in bytes of all
// segments of the reader including their headers.
func (s *Reader) Size() int64 {
        return s.size
}

// ChunkOrIterable returns the chunk that meta.Ref refers to. The reference is
// interpreted as a BlockChunkRef. The CRC32 checksum of the chunk is verified
// before the chunk is obtained from the pool. The returned iterable is always nil.
func (s *Reader) ChunkOrIterable(meta Meta) (chunkenc.Chunk, chunkenc.Iterable, error) {
        sgmIndex, chkStart := BlockChunkRef(meta.Ref).Unpack()
        if sgmIndex >= len(s.bs) {
                return nil, nil, fmt.Errorf("segment index %d out of range", sgmIndex)
        }
        segment := s.bs[sgmIndex]

        lenFieldEnd := chkStart + MaxChunkLengthFieldSize
        if lenFieldEnd > segment.Len() {
                return nil, nil, fmt.Errorf("segment doesn't include enough bytes to read the chunk size data field - required:%v, available:%v", lenFieldEnd, segment.Len())
        }

        // With the minimum chunk length this should never cause us reading
        // over the end of the slice.
        chkDataLen, n := binary.Uvarint(segment.Range(chkStart, lenFieldEnd))
        if n <= 0 {
                return nil, nil, fmt.Errorf("reading chunk length failed with %d", n)
        }

        // Layout behind the length field: encoding | data | crc32.
        encStart := chkStart + n
        dataStart := encStart + ChunkEncodingSize
        dataEnd := dataStart + int(chkDataLen)
        chkEnd := dataEnd + crc32.Size

        if chkEnd > segment.Len() {
                return nil, nil, fmt.Errorf("segment doesn't include enough bytes to read the chunk - required:%v, available:%v", chkEnd, segment.Len())
        }

        // The checksum covers the encoding byte and the data.
        storedSum := segment.Range(dataEnd, chkEnd)
        if err := checkCRC32(segment.Range(encStart, dataEnd), storedSum); err != nil {
                return nil, nil, err
        }

        data := segment.Range(dataStart, dataEnd)
        encoding := segment.Range(encStart, encStart+ChunkEncodingSize)[0]
        chk, err := s.pool.Get(chunkenc.Encoding(encoding), data)
        return chk, nil, err
}

// nextSequenceFile scans dir for entries whose names parse as unsigned
// decimal numbers and returns the path and index of the file that follows
// the highest one found. Entries with non-numeric names are ignored; an
// empty directory yields index 1. The error from reading dir is returned
// unchanged.
func nextSequenceFile(dir string) (string, int, error) {
        entries, err := os.ReadDir(dir)
        if err != nil {
                return "", 0, err
        }

        // Directory order is lexical, not numeric (e.g. '1000000' sorts before
        // '200000'), so track the maximum explicitly instead of taking the last.
        var highest uint64
        for _, entry := range entries {
                seq, parseErr := strconv.ParseUint(entry.Name(), 10, 64)
                if parseErr == nil && seq > highest {
                        highest = seq
                }
        }

        next := int(highest + 1)
        return segmentFile(dir, next), next, nil
}

// segmentFile returns the path of the file with the given index inside
// baseDir. The file name is the index zero-padded to at least six digits.
func segmentFile(baseDir string, index int) string {
        name := fmt.Sprintf("%0.6d", index)
        return filepath.Join(baseDir, name)
}

// sequenceFiles returns the paths of all entries in dir whose names parse as
// unsigned decimal numbers, in the order os.ReadDir reports them. The result
// is nil when no entry qualifies.
func sequenceFiles(dir string) ([]string, error) {
        entries, err := os.ReadDir(dir)
        if err != nil {
                return nil, err
        }

        var paths []string
        for _, entry := range entries {
                name := entry.Name()
                if _, parseErr := strconv.ParseUint(name, 10, 64); parseErr == nil {
                        paths = append(paths, filepath.Join(dir, name))
                }
        }
        return paths, nil
}

// Copyright 2020 The Prometheus Authors
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package chunks

import (
        "bufio"
        "encoding/binary"
        "errors"
        "fmt"
        "hash"
        "io"
        "os"
        "path/filepath"
        "slices"
        "strconv"
        "sync"

        "github.com/dennwc/varint"
        "github.com/prometheus/client_golang/prometheus"
        "go.uber.org/atomic"

        "github.com/prometheus/prometheus/tsdb/chunkenc"
        tsdb_errors "github.com/prometheus/prometheus/tsdb/errors"
        "github.com/prometheus/prometheus/tsdb/fileutil"
)

// Head chunk file header fields constants.
const (
        // MagicHeadChunks is 4 bytes at the beginning of a head chunk file.
        // It is verified when existing files are opened for m-mapping and is
        // passed to cutSegmentFile when a new file is cut.
        MagicHeadChunks = 0x0130BC91

        // headChunksFormatV1 is the format version passed to cutSegmentFile
        // when a new head chunk file is cut.
        headChunksFormatV1 = 1
)

// ErrChunkDiskMapperClosed is returned by ChunkDiskMapper methods to
// indicate that the ChunkDiskMapper was closed.
var ErrChunkDiskMapperClosed = errors.New("ChunkDiskMapper closed")

const (
        // MintMaxtSize is the size in bytes of each of the mint and maxt fields
        // stored with a chunk in a head chunk file.
        MintMaxtSize = 8
        // SeriesRefSize is the size in bytes of the series reference on disk.
        SeriesRefSize = 8
        // HeadChunkFileHeaderSize is the total size of the header for a head chunk file.
        HeadChunkFileHeaderSize = SegmentHeaderSize
        // MaxHeadChunkFileSize is the max size of a head chunk file.
        MaxHeadChunkFileSize = 128 * 1024 * 1024 // 128 MiB.
        // CRCSize is the size in bytes of the crc32 sum on disk.
        CRCSize = 4
        // MaxHeadChunkMetaSize is the max size of an m-mapped chunk minus the chunk's data,
        // i.e. series ref, mint, maxt, encoding, data length field and crc32.
        // It is a maximum because the uvarint-encoded length can be smaller.
        MaxHeadChunkMetaSize = SeriesRefSize + 2*MintMaxtSize + ChunkEncodingSize + MaxChunkLengthFieldSize + CRCSize
        // MinWriteBufferSize is the minimum write buffer size allowed.
        MinWriteBufferSize = 64 * 1024 // 64 KiB.
        // MaxWriteBufferSize is the maximum write buffer size allowed.
        MaxWriteBufferSize = 8 * 1024 * 1024 // 8 MiB.
        // DefaultWriteBufferSize is the default write buffer size.
        DefaultWriteBufferSize = 4 * 1024 * 1024 // 4 MiB.
        // DefaultWriteQueueSize is the default size of the in-memory queue used before flushing chunks to the disk.
        // A value of 0 completely disables this feature.
        DefaultWriteQueueSize = 0
)

// ChunkDiskMapperRef identifies where a head chunk lives on disk.
// The upper 32 bits carry the index of the head chunk file and the lower
// 32 bits carry the byte offset in that file at which the chunk starts.
type ChunkDiskMapperRef uint64

// newChunkDiskMapperRef packs a file sequence number and a byte offset into
// a single reference. The offset is OR-ed in as-is, so it is expected to fit
// in the lower 32 bits.
func newChunkDiskMapperRef(seq, offset uint64) ChunkDiskMapperRef {
        packed := seq<<32 | offset
        return ChunkDiskMapperRef(packed)
}

// Unpack splits the reference into the file sequence number (upper 32 bits)
// and the offset within that file (lower 32 bits).
func (ref ChunkDiskMapperRef) Unpack() (seq, offset int) {
        const lower32 = 0xFFFFFFFF
        return int(ref >> 32), int(ref & lower32)
}

// GreaterThanOrEqualTo reports whether ref is at or after r, comparing the
// file sequence first and the offset second.
func (ref ChunkDiskMapperRef) GreaterThanOrEqualTo(r ChunkDiskMapperRef) bool {
        seqA, offA := ref.Unpack()
        seqB, offB := r.Unpack()
        if seqA != seqB {
                return seqA > seqB
        }
        return offA >= offB
}

// GreaterThan reports whether ref is strictly after r, comparing the file
// sequence first and the offset second.
func (ref ChunkDiskMapperRef) GreaterThan(r ChunkDiskMapperRef) bool {
        seqA, offA := ref.Unpack()
        seqB, offB := r.Unpack()
        if seqA != seqB {
                return seqA > seqB
        }
        return offA > offB
}

// CorruptionErr is the error returned when corruption is encountered in a
// head chunk file.
type CorruptionErr struct {
        Dir       string // Directory holding the head chunk files.
        FileIndex int    // Index of the affected head chunk file.
        Err       error  // Underlying cause.
}

// Error renders the message, naming the affected file by the path that
// segmentFile derives from Dir and FileIndex.
func (e *CorruptionErr) Error() string {
        file := segmentFile(e.Dir, e.FileIndex)
        wrapped := fmt.Errorf("corruption in head chunk file %s: %w", file, e.Err)
        return wrapped.Error()
}

// Unwrap exposes the underlying cause so that errors.Is and errors.As can
// see through a CorruptionErr.
func (e *CorruptionErr) Unwrap() error {
        cause := e.Err
        return cause
}

// chunkPos keeps track of the position in the head chunk files.
// chunkPos is not thread-safe, a lock must be used to protect it.
// An offset of 0 means that no head chunk file has been started yet
// (see shouldCutNewFile).
type chunkPos struct {
        seq     uint64 // Index of chunk file.
        offset  uint64 // Offset within chunk file.
        cutFile bool   // When true then the next chunk will be written to a new file.
}

// getNextChunkRef computes the reference the given chunk will have once it
// has been written, and advances the tracked position past it.
// The second return value tells the caller whether a new file has to be cut
// before this chunk is written; in that case the position has already been
// moved to the start of the new file and any pending cut request is cleared.
// Calls must happen in the same order in which the chunks are written to disk.
func (f *chunkPos) getNextChunkRef(chk chunkenc.Chunk) (chkRef ChunkDiskMapperRef, cutFile bool) {
        size := f.bytesToWriteForChunk(uint64(len(chk.Bytes())))

        cutFile = f.shouldCutNewFile(size)
        if cutFile {
                f.toNewFile()
                f.cutFile = false
        }

        // The chunk starts at the current offset; the next one follows it.
        chkRef = newChunkDiskMapperRef(f.seq, f.offset)
        f.offset += size

        return chkRef, cutFile
}

// toNewFile moves the position to the beginning of the next chunk file:
// the sequence is incremented and the offset is placed right after the
// segment header.
func (f *chunkPos) toNewFile() {
        f.seq, f.offset = f.seq+1, SegmentHeaderSize
}

// cutFileOnNextChunk requests that the next chunk be written into a new file.
// It only sets the cutFile flag; shouldCutNewFile honours it and
// getNextChunkRef clears it once the cut has been accounted for.
// Not thread safe, a lock must be held when calling this.
func (f *chunkPos) cutFileOnNextChunk() {
        f.cutFile = true
}

// setSeq sets the sequence number of the head chunk file.
// The offset is left untouched.
func (f *chunkPos) setSeq(seq uint64) {
        f.seq = seq
}

// shouldCutNewFile reports whether a new file has to be started before
// writing bytesToWrite more bytes. That is the case when a cut was requested,
// when no file has been started yet, or when the write would push the current
// file past MaxHeadChunkFileSize.
// Not thread safe, a lock must be held when calling this.
func (f *chunkPos) shouldCutNewFile(bytesToWrite uint64) bool {
        switch {
        case f.cutFile:
                // A cut was explicitly requested.
                return true
        case f.offset == 0:
                // First head chunk file.
                return true
        default:
                // Cut when the write would exceed the max head chunk file size.
                return f.offset+bytesToWrite > MaxHeadChunkFileSize
        }
}

// bytesToWriteForChunk returns how many bytes a chunk with chkLen bytes of
// data occupies on disk, counting the meta data written before and after it.
// Head chunk format: https://github.com/prometheus/prometheus/blob/main/tsdb/docs/format/head_chunks.md#chunk
func (f *chunkPos) bytesToWriteForChunk(chkLen uint64) uint64 {
        // Fixed-size fields preceding the data: series ref, mint, maxt, encoding.
        header := uint64(SeriesRefSize) + 2*MintMaxtSize + ChunkEncodingSize

        // The data length is stored as a uvarint, so its size depends on chkLen.
        lenField := uint64(varint.UvarintSize(chkLen))

        // header | length | data | crc32.
        return header + lenField + chkLen + CRCSize
}

// ChunkDiskMapper is for writing the Head block chunks to disk
// and access chunks via mmapped files.
type ChunkDiskMapper struct {
        // Writer.
        dir             *os.File // Directory holding the head chunk files.
        writeBufferSize int      // Size of the buffer behind chkWriter.

        curFile         *os.File      // File being written to.
        curFileSequence int           // Index of current open file being appended to. 0 if no file is active.
        curFileOffset   atomic.Uint64 // Bytes written in current open file.
        curFileMaxt     int64         // Used for the size retention.

        // The values in evtlPos represent the file position which will eventually be
        // reached once the content of the write queue has been fully processed.
        evtlPosMtx sync.Mutex
        evtlPos    chunkPos

        byteBuf      [MaxHeadChunkMetaSize]byte // Buffer used to write the header of the chunk.
        chkWriter    *bufio.Writer              // Writer for the current open file.
        crc32        hash.Hash                  // Running checksum of the chunk currently being written.
        writePathMtx sync.Mutex                 // Held by writeChunk for the whole duration of a write.

        // Reader.
        // The int key in the map is the file number on the disk.
        mmappedChunkFiles map[int]*mmappedChunkFile // Contains the m-mapped files for each chunk file mapped with its index.
        closers           map[int]io.Closer         // Closers for resources behind the byte slices.
        readPathMtx       sync.RWMutex              // Mutex used to protect the above 2 maps.
        pool              chunkenc.Pool             // This is used when fetching a chunk from the disk to allocate a chunk.

        // Writer and Reader.
        // We flush chunks to disk in batches. Hence, we store them in this buffer
        // from which chunks are served till they are flushed and are ready for m-mapping.
        chunkBuffer *chunkBuffer

        // Whether the maxt field is set for all mmapped chunk files tracked within the mmappedChunkFiles map.
        // This is done after iterating through all the chunks in those files using the IterateAllChunks method.
        fileMaxtSet bool

        // Optional queue through which WriteChunk hands chunks to writeChunk.
        // It is nil unless a positive write queue size was given to NewChunkDiskMapper.
        writeQueue *chunkWriteQueue

        // When true, writeChunk and Chunk return ErrChunkDiskMapperClosed.
        closed bool
}

// mmappedChunkFile provides mmap access to an entire head chunks file that holds many chunks.
type mmappedChunkFile struct {
        byteSlice ByteSlice // Contents of the m-mapped file.
        maxt      int64     // Max timestamp among all of this file's chunks.
}

// NewChunkDiskMapper creates a ChunkDiskMapper on top of dir, creating the
// directory if needed and m-mapping the head chunk files already in it.
//
// writeBufferSize must lie between MinWriteBufferSize and MaxWriteBufferSize
// and be a multiple of 1024. A writeQueueSize greater than 0 enables the
// write queue, registered against reg. A nil pool is replaced by a fresh
// chunkenc pool.
//
// The mapper is returned together with the error from opening the existing
// files, so it is non-nil even when that step fails.
// NOTE: 'IterateAllChunks' method needs to be called at least once after creating ChunkDiskMapper
// to set the maxt of all files.
func NewChunkDiskMapper(reg prometheus.Registerer, dir string, pool chunkenc.Pool, writeBufferSize, writeQueueSize int) (*ChunkDiskMapper, error) {
        // Validate write buffer size.
        switch {
        case writeBufferSize < MinWriteBufferSize, writeBufferSize > MaxWriteBufferSize:
                return nil, fmt.Errorf("ChunkDiskMapper write buffer size should be between %d and %d (actual: %d)", MinWriteBufferSize, MaxWriteBufferSize, writeBufferSize)
        case writeBufferSize%1024 != 0:
                return nil, fmt.Errorf("ChunkDiskMapper write buffer size should be a multiple of 1024 (actual: %d)", writeBufferSize)
        }

        if err := os.MkdirAll(dir, 0o777); err != nil {
                return nil, err
        }
        dirFile, err := fileutil.OpenDir(dir)
        if err != nil {
                return nil, err
        }

        if pool == nil {
                pool = chunkenc.NewPool()
        }

        mapper := &ChunkDiskMapper{
                dir:             dirFile,
                pool:            pool,
                writeBufferSize: writeBufferSize,
                crc32:           newCRC32(),
                chunkBuffer:     newChunkBuffer(),
        }
        if writeQueueSize > 0 {
                mapper.writeQueue = newChunkWriteQueue(reg, writeQueueSize, mapper.writeChunk)
        }

        err = mapper.openMMapFiles()
        return mapper, err
}

// Chunk encodings for out-of-order chunks.
// These encodings must be only used by the Head block for its internal bookkeeping.
const (
        // OutOfOrderMask is the most significant bit of the encoding byte.
        // ApplyOutOfOrderMask sets it, IsOutOfOrderChunk tests it and
        // RemoveMasks clears it.
        OutOfOrderMask = uint8(0b10000000)
)

// ApplyOutOfOrderMask returns sourceEncoding with the OutOfOrderMask bit set.
func (cdm *ChunkDiskMapper) ApplyOutOfOrderMask(sourceEncoding chunkenc.Encoding) chunkenc.Encoding {
        return chunkenc.Encoding(uint8(sourceEncoding) | OutOfOrderMask)
}

// IsOutOfOrderChunk reports whether the OutOfOrderMask bit is set in e.
func (cdm *ChunkDiskMapper) IsOutOfOrderChunk(e chunkenc.Encoding) bool {
        masked := uint8(e) & OutOfOrderMask
        return masked != 0
}

// RemoveMasks returns sourceEncoding with the OutOfOrderMask bit cleared,
// leaving the remaining bits untouched.
func (cdm *ChunkDiskMapper) RemoveMasks(sourceEncoding chunkenc.Encoding) chunkenc.Encoding {
        return chunkenc.Encoding(uint8(sourceEncoding) &^ OutOfOrderMask)
}

// openMMapFiles opens all files within dir for mmapping.
//
// It lists the head chunk files, lets repairLastChunkFile drop an empty last
// file, m-maps every remaining file and then validates that the file indices
// are contiguous and that each file carries a valid header (size, magic
// number and format version). On success the eventual write position is set
// to the highest sequence found. On any error, everything mapped so far is
// closed again and both maps are reset to nil.
func (cdm *ChunkDiskMapper) openMMapFiles() (returnErr error) {
        cdm.mmappedChunkFiles = map[int]*mmappedChunkFile{}
        cdm.closers = map[int]io.Closer{}
        defer func() {
                // Undo partial work: close what was mapped and merge any close
                // error into the one being returned.
                if returnErr != nil {
                        returnErr = tsdb_errors.NewMulti(returnErr, closeAllFromMap(cdm.closers)).Err()

                        cdm.mmappedChunkFiles = nil
                        cdm.closers = nil
                }
        }()

        files, err := listChunkFiles(cdm.dir.Name())
        if err != nil {
                return err
        }

        files, err = repairLastChunkFile(files)
        if err != nil {
                return err
        }

        chkFileIndices := make([]int, 0, len(files))
        for seq, fn := range files {
                f, err := fileutil.OpenMmapFile(fn)
                if err != nil {
                        return fmt.Errorf("mmap files, file: %s: %w", fn, err)
                }
                cdm.closers[seq] = f
                cdm.mmappedChunkFiles[seq] = &mmappedChunkFile{byteSlice: realByteSlice(f.Bytes())}
                chkFileIndices = append(chkFileIndices, seq)
        }

        // Check for gaps in the files.
        slices.Sort(chkFileIndices)
        if len(chkFileIndices) == 0 {
                // Nothing on disk yet; the eventual position keeps its zero value.
                return nil
        }
        lastSeq := chkFileIndices[0]
        for _, seq := range chkFileIndices[1:] {
                if seq != lastSeq+1 {
                        return fmt.Errorf("found unsequential head chunk files %s (index: %d) and %s (index: %d)", files[lastSeq], lastSeq, files[seq], seq)
                }
                lastSeq = seq
        }

        // Map iteration order is random, so when several files are invalid the
        // one reported is not deterministic.
        for i, b := range cdm.mmappedChunkFiles {
                if b.byteSlice.Len() < HeadChunkFileHeaderSize {
                        return fmt.Errorf("%s: invalid head chunk file header: %w", files[i], errInvalidSize)
                }
                // Verify magic number.
                if m := binary.BigEndian.Uint32(b.byteSlice.Range(0, MagicChunksSize)); m != MagicHeadChunks {
                        return fmt.Errorf("%s: invalid magic number %x", files[i], m)
                }

                // Verify chunk format version.
                // NOTE(review): this compares against chunksFormatV1 while new files are
                // cut with headChunksFormatV1; presumably both are 1 — confirm.
                if v := int(b.byteSlice.Range(MagicChunksSize, MagicChunksSize+ChunksFormatVersionSize)[0]); v != chunksFormatV1 {
                        return fmt.Errorf("%s: invalid chunk format version %d", files[i], v)
                }
        }

        // Continue numbering new files after the last one found on disk.
        cdm.evtlPos.setSeq(uint64(lastSeq))

        return nil
}

// listChunkFiles maps the sequence number of every entry in dir whose name
// parses as an unsigned decimal number to that entry's path. Other entries
// are skipped. The returned map is never nil on success.
func listChunkFiles(dir string) (map[int]string, error) {
        entries, err := os.ReadDir(dir)
        if err != nil {
                return nil, err
        }

        bySeq := make(map[int]string)
        for _, entry := range entries {
                name := entry.Name()
                seq, parseErr := strconv.ParseUint(name, 10, 64)
                if parseErr != nil {
                        continue
                }
                bySeq[int(seq)] = filepath.Join(dir, name)
        }
        return bySeq, nil
}

// HardLinkChunkFiles hard-links every chunk file found in src into dst.
// A missing src is not an error and results in no work being done; dst is
// created when it does not exist yet. The first failure aborts the operation
// and is returned wrapped with a short description of the failing step.
func HardLinkChunkFiles(src, dst string) error {
        if _, err := os.Stat(src); err != nil {
                if os.IsNotExist(err) {
                        return nil
                }
                return fmt.Errorf("check source chunks dir: %w", err)
        }

        if err := os.MkdirAll(dst, 0o777); err != nil {
                return fmt.Errorf("set up destination chunks dir: %w", err)
        }

        files, err := listChunkFiles(src)
        if err != nil {
                return fmt.Errorf("list chunks: %w", err)
        }

        for _, filePath := range files {
                _, name := filepath.Split(filePath)
                from, to := filepath.Join(src, name), filepath.Join(dst, name)
                if linkErr := os.Link(from, to); linkErr != nil {
                        return fmt.Errorf("hardlink a chunk: %w", linkErr)
                }
        }
        return nil
}

// repairLastChunkFile deletes the last file if it's empty.
// Because we don't fsync when creating these files, we could end
// up with an empty file at the end during an abrupt shutdown.
//
// "Empty" means that the file is shorter than the magic number or that its
// magic number is all zeros. The last file is the one with the highest
// sequence number; nothing is done when that number is not positive, which
// includes an empty files map. The map is modified in place and returned; on
// error it is returned as it was when the error occurred.
func repairLastChunkFile(files map[int]string) (_ map[int]string, returnErr error) {
        lastFile := -1
        for seq := range files {
                if seq > lastFile {
                        lastFile = seq
                }
        }

        if lastFile <= 0 {
                return files, nil
        }

        f, err := os.Open(files[lastFile])
        if err != nil {
                return files, fmt.Errorf("open file during last head chunk file repair: %w", err)
        }

        buf := make([]byte, MagicChunksSize)
        size, err := f.Read(buf)
        if err != nil && !errors.Is(err, io.EOF) {
                // Release the descriptor before bailing out. The read error is the
                // one worth reporting, so a failure to close is deliberately dropped.
                _ = f.Close()
                return files, fmt.Errorf("failed to read magic number during last head chunk file repair: %w", err)
        }
        if err := f.Close(); err != nil {
                return files, fmt.Errorf("close file during last head chunk file repair: %w", err)
        }

        // We either don't have enough bytes for the magic number or the magic number is 0.
        // NOTE: we should not check for wrong magic number here because that error
        // needs to be sent up the function called (already done elsewhere)
        // for proper repair mechanism to happen in the Head.
        if size < MagicChunksSize || binary.BigEndian.Uint32(buf) == 0 {
                // Corrupt file, hence remove it.
                if err := os.RemoveAll(files[lastFile]); err != nil {
                        return files, fmt.Errorf("delete corrupted, empty head chunk file during last file repair: %w", err)
                }
                delete(files, lastFile)
        }

        return files, nil
}

// WriteChunk writes the chunk to disk, either directly or through the write
// queue when one is configured, and returns the reference under which the
// chunk can be fetched. Without a queue, callback (if non-nil) is invoked
// with the result of the write before WriteChunk returns.
func (cdm *ChunkDiskMapper) WriteChunk(seriesRef HeadSeriesRef, mint, maxt int64, chk chunkenc.Chunk, isOOO bool, callback func(err error)) (chkRef ChunkDiskMapperRef) {
        // cdm.evtlPosMtx must be held to serialize the calls to cdm.evtlPos.getNextChunkRef() and the writing of the chunk (either with or without queue).
        cdm.evtlPosMtx.Lock()
        defer cdm.evtlPosMtx.Unlock()

        chkRef, cutFile := cdm.evtlPos.getNextChunkRef(chk)

        if cdm.writeQueue == nil {
                // No queue: write synchronously and report the outcome right away.
                err := cdm.writeChunk(seriesRef, mint, maxt, chk, chkRef, isOOO, cutFile)
                if callback != nil {
                        callback(err)
                }
                return chkRef
        }

        return cdm.writeChunkViaQueue(chkRef, isOOO, cutFile, seriesRef, mint, maxt, chk, callback)
}

// writeChunkViaQueue submits the chunk to the write queue and returns ref.
// The callback is handed to the queue as part of the job; in addition, when
// adding the job itself fails, a non-nil callback is invoked here with that
// error.
func (cdm *ChunkDiskMapper) writeChunkViaQueue(ref ChunkDiskMapperRef, isOOO, cutFile bool, seriesRef HeadSeriesRef, mint, maxt int64, chk chunkenc.Chunk, callback func(err error)) (chkRef ChunkDiskMapperRef) {
        job := chunkWriteJob{
                cutFile:   cutFile,
                seriesRef: seriesRef,
                mint:      mint,
                maxt:      maxt,
                chk:       chk,
                ref:       ref,
                isOOO:     isOOO,
                callback:  callback,
        }

        if err := cdm.writeQueue.addJob(job); err != nil && callback != nil {
                callback(err)
        }

        return ref
}

// writeChunk writes a single chunk to the current head chunk file, cutting a
// new file first when cutFile is set. It holds writePathMtx for the whole
// call and returns ErrChunkDiskMapperClosed when the mapper is closed.
//
// The on-disk record is: series ref | mint | maxt | encoding | uvarint data
// length | data | crc32, where the checksum covers everything before it.
// After a successful write the chunk is also placed in the chunk buffer
// under ref, from which it is served until the buffer is flushed.
func (cdm *ChunkDiskMapper) writeChunk(seriesRef HeadSeriesRef, mint, maxt int64, chk chunkenc.Chunk, ref ChunkDiskMapperRef, isOOO, cutFile bool) (err error) {
        cdm.writePathMtx.Lock()
        defer cdm.writePathMtx.Unlock()

        if cdm.closed {
                return ErrChunkDiskMapperClosed
        }

        if cutFile {
                // The new file must start exactly where ref says the chunk will be.
                err := cdm.cutAndExpectRef(ref)
                if err != nil {
                        return err
                }
        }

        // if len(chk.Bytes())+MaxHeadChunkMetaSize >= writeBufferSize, it means that chunk >= the buffer size;
        // so no need to flush here, as we have to flush at the end (to not keep partial chunks in buffer).
        if len(chk.Bytes())+MaxHeadChunkMetaSize < cdm.writeBufferSize && cdm.chkWriter.Available() < MaxHeadChunkMetaSize+len(chk.Bytes()) {
                if err := cdm.flushBuffer(); err != nil {
                        return err
                }
        }

        cdm.crc32.Reset()
        bytesWritten := 0

        // Assemble the fixed-size header fields and the length in byteBuf.
        binary.BigEndian.PutUint64(cdm.byteBuf[bytesWritten:], uint64(seriesRef))
        bytesWritten += SeriesRefSize
        binary.BigEndian.PutUint64(cdm.byteBuf[bytesWritten:], uint64(mint))
        bytesWritten += MintMaxtSize
        binary.BigEndian.PutUint64(cdm.byteBuf[bytesWritten:], uint64(maxt))
        bytesWritten += MintMaxtSize
        enc := chk.Encoding()
        if isOOO {
                // Out-of-order chunks are marked in the encoding byte itself.
                enc = cdm.ApplyOutOfOrderMask(enc)
        }
        cdm.byteBuf[bytesWritten] = byte(enc)
        bytesWritten += ChunkEncodingSize
        n := binary.PutUvarint(cdm.byteBuf[bytesWritten:], uint64(len(chk.Bytes())))
        bytesWritten += n

        // Header and data are fed to the checksum as they are written.
        if err := cdm.writeAndAppendToCRC32(cdm.byteBuf[:bytesWritten]); err != nil {
                return err
        }
        if err := cdm.writeAndAppendToCRC32(chk.Bytes()); err != nil {
                return err
        }
        if err := cdm.writeCRC32(); err != nil {
                return err
        }

        if maxt > cdm.curFileMaxt {
                cdm.curFileMaxt = maxt
        }

        cdm.chunkBuffer.put(ref, chk)

        if len(chk.Bytes())+MaxHeadChunkMetaSize >= cdm.writeBufferSize {
                // The chunk was bigger than the buffer itself.
                // Flushing to not keep partial chunks in buffer.
                if err := cdm.flushBuffer(); err != nil {
                        return err
                }
        }

        return nil
}

// CutNewFile requests that a new file be created the next time a chunk is
// written. It only marks the eventual position under evtlPosMtx; no file is
// created by this call itself.
func (cdm *ChunkDiskMapper) CutNewFile() {
        cdm.evtlPosMtx.Lock()
        cdm.evtlPos.cutFileOnNextChunk()
        cdm.evtlPosMtx.Unlock()
}

// IsQueueEmpty reports whether the write queue holds no pending work.
// A mapper without a write queue is always considered empty.
func (cdm *ChunkDiskMapper) IsQueueEmpty() bool {
        return cdm.writeQueue == nil || cdm.writeQueue.queueIsEmpty()
}

// cutAndExpectRef cuts a new m-mapped file and verifies that writing in it
// starts exactly at the position encoded in chkRef. A mismatch in either the
// sequence or the offset is reported as an error.
// The write lock should be held before calling this.
func (cdm *ChunkDiskMapper) cutAndExpectRef(chkRef ChunkDiskMapperRef) (err error) {
        gotSeq, gotOffset, err := cdm.cut()
        if err != nil {
                return err
        }

        wantSeq, wantOffset := chkRef.Unpack()
        if gotSeq == wantSeq && gotOffset == wantOffset {
                return nil
        }
        return fmt.Errorf("expected newly cut file to have sequence:offset %d:%d, got %d:%d", wantSeq, wantOffset, gotSeq, gotOffset)
}

// cut creates a new m-mapped file. The write lock should be held before calling this.
// It returns the file sequence and the offset in that file to start writing chunks.
//
// The current file, if any, is finalized first. The new file is then created
// via cutSegmentFile, m-mapped with MaxHeadChunkFileSize and registered as
// the current file for both the write and the read path. If an error occurs
// after the new file was created, that file is closed before returning.
func (cdm *ChunkDiskMapper) cut() (seq, offset int, returnErr error) {
        // Sync current tail to disk and close.
        if err := cdm.finalizeCurFile(); err != nil {
                return 0, 0, err
        }

        offset, newFile, seq, err := cutSegmentFile(cdm.dir, MagicHeadChunks, headChunksFormatV1, HeadChunkFilePreallocationSize)
        if err != nil {
                return 0, 0, err
        }

        defer func() {
                // The file should not be closed if there is no error,
                // its kept open in the ChunkDiskMapper.
                if returnErr != nil {
                        returnErr = tsdb_errors.NewMulti(returnErr, newFile.Close()).Err()
                }
        }()

        cdm.curFileOffset.Store(uint64(offset))

        if cdm.curFile != nil {
                // Record the max timestamp seen in the file that was just finalized.
                cdm.readPathMtx.Lock()
                cdm.mmappedChunkFiles[cdm.curFileSequence].maxt = cdm.curFileMaxt
                cdm.readPathMtx.Unlock()
        }

        mmapFile, err := fileutil.OpenMmapFileWithSize(newFile.Name(), MaxHeadChunkFileSize)
        if err != nil {
                return 0, 0, err
        }

        // Switch writer and reader state over to the new file under the read path lock.
        cdm.readPathMtx.Lock()
        cdm.curFileSequence = seq
        cdm.curFile = newFile
        if cdm.chkWriter != nil {
                cdm.chkWriter.Reset(newFile)
        } else {
                cdm.chkWriter = bufio.NewWriterSize(newFile, cdm.writeBufferSize)
        }

        cdm.closers[cdm.curFileSequence] = mmapFile
        cdm.mmappedChunkFiles[cdm.curFileSequence] = &mmappedChunkFile{byteSlice: realByteSlice(mmapFile.Bytes())}
        cdm.readPathMtx.Unlock()

        // The new file has no chunks yet.
        cdm.curFileMaxt = 0

        return seq, offset, nil
}

// finalizeCurFile flushes all pending data to the current tail file, syncs
// the file to disk and closes it. It is a no-op when no file is open.
// The first error encountered is returned.
func (cdm *ChunkDiskMapper) finalizeCurFile() error {
        tail := cdm.curFile
        if tail == nil {
                return nil
        }

        if err := cdm.flushBuffer(); err != nil {
                return err
        }
        if err := tail.Sync(); err != nil {
                return err
        }
        return tail.Close()
}

// write sends b to the buffered writer of the current file and advances
// curFileOffset by the number of bytes the writer accepted, even when the
// write reports an error.
func (cdm *ChunkDiskMapper) write(b []byte) error {
        written, err := cdm.chkWriter.Write(b)
        cdm.curFileOffset.Add(uint64(written))
        return err
}

// writeAndAppendToCRC32 writes b to the current file and, only if that
// succeeded, feeds the same bytes into the running CRC32.
func (cdm *ChunkDiskMapper) writeAndAppendToCRC32(b []byte) error {
        err := cdm.write(b)
        if err == nil {
                _, err = cdm.crc32.Write(b)
        }
        return err
}

// writeCRC32 writes the current CRC32 sum to the file. The sum is rendered
// into the start of byteBuf, reusing it as scratch space.
func (cdm *ChunkDiskMapper) writeCRC32() error {
        sum := cdm.crc32.Sum(cdm.byteBuf[:0])
        return cdm.write(sum)
}

// flushBuffer flushes the buffered writer and, once that succeeded, clears
// the in-memory chunk buffer. On a flush error the chunk buffer is left
// untouched.
// Assumes that writePathMtx is _write_ locked before calling this method.
func (cdm *ChunkDiskMapper) flushBuffer() error {
        err := cdm.chkWriter.Flush()
        if err == nil {
                cdm.chunkBuffer.clear()
        }
        return err
}

// Chunk returns a chunk from a given reference.
//
// Lookup order: the write queue (when configured), then the chunk buffer if
// the reference points into the file currently being written, and finally
// the m-mapped file. For the m-mapped path the record is validated (length
// field, data bounds, CRC32 bounds and CRC32 value) and the chunk data is
// copied out of the mapping before being handed to the pool.
//
// It returns ErrChunkDiskMapperClosed when the mapper is closed and a
// *CorruptionErr for every problem found with the on-disk data.
func (cdm *ChunkDiskMapper) Chunk(ref ChunkDiskMapperRef) (chunkenc.Chunk, error) {
        cdm.readPathMtx.RLock()
        // We hold this read lock for the entire duration because if Close()
        // is called, the data in the byte slice will get corrupted as the mmapped
        // file will be closed.
        defer cdm.readPathMtx.RUnlock()

        if cdm.closed {
                return nil, ErrChunkDiskMapperClosed
        }

        if cdm.writeQueue != nil {
                chunk := cdm.writeQueue.get(ref)
                if chunk != nil {
                        return chunk, nil
                }
        }

        sgmIndex, chkStart := ref.Unpack()
        // We skip the series ref and the mint/maxt beforehand.
        chkStart += SeriesRefSize + (2 * MintMaxtSize)

        // If it is the current open file, then the chunks can be in the buffer too.
        if sgmIndex == cdm.curFileSequence {
                chunk := cdm.chunkBuffer.get(ref)
                if chunk != nil {
                        return chunk, nil
                }
        }

        mmapFile, ok := cdm.mmappedChunkFiles[sgmIndex]
        if !ok {
                if sgmIndex > cdm.curFileSequence {
                        return nil, &CorruptionErr{
                                Dir:       cdm.dir.Name(),
                                FileIndex: -1,
                                Err:       fmt.Errorf("head chunk file index %d more than current open file", sgmIndex),
                        }
                }
                return nil, &CorruptionErr{
                        Dir:       cdm.dir.Name(),
                        FileIndex: sgmIndex,
                        Err:       fmt.Errorf("head chunk file index %d does not exist on disk", sgmIndex),
                }
        }

        if chkStart+MaxChunkLengthFieldSize > mmapFile.byteSlice.Len() {
                return nil, &CorruptionErr{
                        Dir:       cdm.dir.Name(),
                        FileIndex: sgmIndex,
                        Err:       fmt.Errorf("head chunk file doesn't include enough bytes to read the chunk size data field - required:%v, available:%v", chkStart+MaxChunkLengthFieldSize, mmapFile.byteSlice.Len()),
                }
        }

        // Encoding.
        chkEnc := mmapFile.byteSlice.Range(chkStart, chkStart+ChunkEncodingSize)[0]
        sourceChkEnc := chunkenc.Encoding(chkEnc)
        // Extract the encoding from the byte. ChunkDiskMapper uses only the last 7 bits for the encoding.
        chkEnc = byte(cdm.RemoveMasks(sourceChkEnc))
        // Data length.
        // With the minimum chunk length this should never cause us reading
        // over the end of the slice.
        chkDataLenStart := chkStart + ChunkEncodingSize
        c := mmapFile.byteSlice.Range(chkDataLenStart, chkDataLenStart+MaxChunkLengthFieldSize)
        chkDataLen, n := binary.Uvarint(c)
        if n <= 0 {
                return nil, &CorruptionErr{
                        Dir:       cdm.dir.Name(),
                        FileIndex: sgmIndex,
                        Err:       fmt.Errorf("reading chunk length failed with %d", n),
                }
        }

        // Verify the chunk data end.
        chkDataEnd := chkDataLenStart + n + int(chkDataLen)
        if chkDataEnd > mmapFile.byteSlice.Len() {
                return nil, &CorruptionErr{
                        Dir:       cdm.dir.Name(),
                        FileIndex: sgmIndex,
                        Err:       fmt.Errorf("head chunk file doesn't include enough bytes to read the chunk - required:%v, available:%v", chkDataEnd, mmapFile.byteSlice.Len()),
                }
        }

        // Verify that the CRC32 trailer lies inside the file as well. A file
        // truncated within the trailer would otherwise be indexed past its end
        // below instead of being reported as corrupted.
        if chkDataEnd+CRCSize > mmapFile.byteSlice.Len() {
                return nil, &CorruptionErr{
                        Dir:       cdm.dir.Name(),
                        FileIndex: sgmIndex,
                        Err:       fmt.Errorf("head chunk file doesn't include enough bytes to read the chunk crc32 - required:%v, available:%v", chkDataEnd+CRCSize, mmapFile.byteSlice.Len()),
                }
        }

        // Check the CRC. It covers the record from the series ref up to the end of the data.
        sum := mmapFile.byteSlice.Range(chkDataEnd, chkDataEnd+CRCSize)
        if err := checkCRC32(mmapFile.byteSlice.Range(chkStart-(SeriesRefSize+2*MintMaxtSize), chkDataEnd), sum); err != nil {
                return nil, &CorruptionErr{
                        Dir:       cdm.dir.Name(),
                        FileIndex: sgmIndex,
                        Err:       err,
                }
        }

        // The chunk data itself.
        chkData := mmapFile.byteSlice.Range(chkDataEnd-int(chkDataLen), chkDataEnd)

        // Make a copy of the chunk data to prevent a panic occurring because the returned
        // chunk data slice references an mmap-ed file which could be closed after the
        // function returns but while the chunk is still in use.
        chkDataCopy := make([]byte, len(chkData))
        copy(chkDataCopy, chkData)

        chk, err := cdm.pool.Get(chunkenc.Encoding(chkEnc), chkDataCopy)
        if err != nil {
                return nil, &CorruptionErr{
                        Dir:       cdm.dir.Name(),
                        FileIndex: sgmIndex,
                        Err:       err,
                }
        }
        return chk, nil
}

// IterateAllChunks iterates all mmappedChunkFiles (in order of head chunk file name/number) and all the chunks within it
// and runs the provided function with information about each chunk. It returns on the first error encountered.
//
// Problems found while decoding a file are reported as *CorruptionErr. If f returns an error that
// is (or wraps) a *CorruptionErr, that error is stamped with the directory and the index of the
// file being read and returned; any other error from f is returned unchanged.
//
// NOTE: This method needs to be called at least once after creating ChunkDiskMapper
// to set the maxt of all files.
func (cdm *ChunkDiskMapper) IterateAllChunks(f func(seriesRef HeadSeriesRef, chunkRef ChunkDiskMapperRef, mint, maxt int64, numSamples uint16, encoding chunkenc.Encoding, isOOO bool) error) (err error) {
        cdm.writePathMtx.Lock()
        defer cdm.writePathMtx.Unlock()

        // Flag the per-file maxt as populated when the iteration ends, on success and on error alike.
        defer func() {
                cdm.fileMaxtSet = true
        }()

        // Iterate files in ascending order.
        segIDs := make([]int, 0, len(cdm.mmappedChunkFiles))
        for seg := range cdm.mmappedChunkFiles {
                segIDs = append(segIDs, seg)
        }
        slices.Sort(segIDs)
        for _, segID := range segIDs {
                mmapFile := cdm.mmappedChunkFiles[segID]
                fileEnd := mmapFile.byteSlice.Len()
                if segID == cdm.curFileSequence {
                        // For the file currently being written, only the bytes written so far are considered.
                        fileEnd = int(cdm.curFileSize())
                }
                idx := HeadChunkFileHeaderSize
                for idx < fileEnd {
                        if fileEnd-idx < MaxHeadChunkMetaSize {
                                // Check for all 0s which marks the end of the file.
                                allZeros := true
                                for _, b := range mmapFile.byteSlice.Range(idx, fileEnd) {
                                        if b != byte(0) {
                                                allZeros = false
                                                break
                                        }
                                }
                                if allZeros {
                                        // End of segment chunk file content.
                                        break
                                }
                                return &CorruptionErr{
                                        Dir:       cdm.dir.Name(),
                                        FileIndex: segID,
                                        Err: fmt.Errorf("head chunk file has some unread data, but doesn't include enough bytes to read the chunk header"+
                                                " - required:%v, available:%v, file:%d", idx+MaxHeadChunkMetaSize, fileEnd, segID),
                                }
                        }
                        chunkRef := newChunkDiskMapperRef(uint64(segID), uint64(idx))

                        // startIdx marks the first byte covered by the chunk's CRC.
                        startIdx := idx
                        seriesRef := HeadSeriesRef(binary.BigEndian.Uint64(mmapFile.byteSlice.Range(idx, idx+SeriesRefSize)))
                        idx += SeriesRefSize
                        mint := int64(binary.BigEndian.Uint64(mmapFile.byteSlice.Range(idx, idx+MintMaxtSize)))
                        idx += MintMaxtSize
                        maxt := int64(binary.BigEndian.Uint64(mmapFile.byteSlice.Range(idx, idx+MintMaxtSize)))
                        idx += MintMaxtSize

                        // We preallocate file to help with m-mapping (especially windows systems).
                        // As series ref always starts from 1, we assume it being 0 to be the end of the actual file data.
                        // We are not considering possible file corruption that can cause it to be 0.
                        // Additionally we are checking mint and maxt just to be sure.
                        if seriesRef == 0 && mint == 0 && maxt == 0 {
                                break
                        }

                        chkEnc := chunkenc.Encoding(mmapFile.byteSlice.Range(idx, idx+ChunkEncodingSize)[0])
                        idx += ChunkEncodingSize
                        dataLen, n := binary.Uvarint(mmapFile.byteSlice.Range(idx, idx+MaxChunkLengthFieldSize))
                        if n <= 0 {
                                // Uvarint reports n <= 0 when the length field cannot be decoded. Treat that as
                                // corruption right away (as the single chunk read path does) instead of
                                // carrying on with a bogus zero length and a non-advancing index.
                                return &CorruptionErr{
                                        Dir:       cdm.dir.Name(),
                                        FileIndex: segID,
                                        Err:       fmt.Errorf("reading chunk length failed with %d", n),
                                }
                        }
                        idx += n

                        // The first two bytes of the chunk data are read as the number of samples.
                        numSamples := binary.BigEndian.Uint16(mmapFile.byteSlice.Range(idx, idx+2))
                        idx += int(dataLen) // Skip the data.

                        // In the beginning we only checked for the chunk meta size.
                        // Now that we have added the chunk data length, we check for sufficient bytes again.
                        if idx+CRCSize > fileEnd {
                                return &CorruptionErr{
                                        Dir:       cdm.dir.Name(),
                                        FileIndex: segID,
                                        Err:       fmt.Errorf("head chunk file doesn't include enough bytes to read the chunk header - required:%v, available:%v, file:%d", idx+CRCSize, fileEnd, segID),
                                }
                        }

                        // Check CRC.
                        sum := mmapFile.byteSlice.Range(idx, idx+CRCSize)
                        if err := checkCRC32(mmapFile.byteSlice.Range(startIdx, idx), sum); err != nil {
                                return &CorruptionErr{
                                        Dir:       cdm.dir.Name(),
                                        FileIndex: segID,
                                        Err:       err,
                                }
                        }
                        idx += CRCSize

                        // Track the largest maxt seen in this file.
                        if maxt > mmapFile.maxt {
                                mmapFile.maxt = maxt
                        }
                        isOOO := cdm.IsOutOfOrderChunk(chkEnc)
                        // Extract the encoding from the byte. ChunkDiskMapper uses only the last 7 bits for the encoding.
                        chkEnc = cdm.RemoveMasks(chkEnc)
                        if err := f(seriesRef, chunkRef, mint, maxt, numSamples, chkEnc, isOOO); err != nil {
                                var cerr *CorruptionErr
                                if errors.As(err, &cerr) {
                                        // The callback does not know which file it is looking at; fill that in here.
                                        cerr.Dir = cdm.dir.Name()
                                        cerr.FileIndex = segID
                                        return cerr
                                }
                                return err
                        }
                }

                if idx > fileEnd {
                        // It should be equal to the slice length.
                        return &CorruptionErr{
                                Dir:       cdm.dir.Name(),
                                FileIndex: segID,
                                Err:       fmt.Errorf("head chunk file doesn't include enough bytes to read the last chunk data - required:%v, available:%v, file:%d", idx, fileEnd, segID),
                        }
                }
        }

        return nil
}

// Truncate deletes the head chunk files with numbers less than the given fileNo.
// The file with the current sequence, and every file after it, is always kept.
// If the current file holds any chunks, a new file is cut before deleting.
func (cdm *ChunkDiskMapper) Truncate(fileNo uint32) error {
        // Snapshot the known file sequences under the read lock. They are sorted because the
        // map is unordered, and a deletion failing midway must not leave unsequential files.
        cdm.readPathMtx.RLock()
        allSeqs := make([]int, 0, len(cdm.mmappedChunkFiles))
        for seq := range cdm.mmappedChunkFiles {
                allSeqs = append(allSeqs, seq)
        }
        slices.Sort(allSeqs)

        // Collect the leading run of files that come before both the current file and fileNo.
        var toRemove []int
        for _, seq := range allSeqs {
                mustKeep := seq == cdm.curFileSequence || uint32(seq) >= fileNo
                if mustKeep {
                        break
                }
                toRemove = append(toRemove, seq)
        }
        cdm.readPathMtx.RUnlock()

        errs := tsdb_errors.NewMulti()
        // Cut a new file only if the current file has some chunks.
        // There is a known race condition here: between the check of curFileSize() and the call
        // to CutNewFile() a new file could already be cut. This is acceptable because it simply
        // results in an empty file which won't do any harm.
        if cdm.curFileSize() > HeadChunkFileHeaderSize {
                cdm.CutNewFile()
        }
        undeleted, delErr := cdm.deleteFiles(toRemove)
        errs.Add(delErr)

        // The sequence only needs resetting when every known file was selected for deletion.
        if len(allSeqs) != len(toRemove) {
                return errs.Err()
        }

        cdm.evtlPosMtx.Lock()
        // We can safely reset the sequence only if the write queue is empty. If it's not empty,
        // then there may be a job in the queue that will create a new segment file with an ID
        // generated before the sequence reset.
        //
        // The queueIsEmpty() function must be called while holding the cdm.evtlPosMtx to avoid
        // a race condition with WriteChunk().
        if cdm.writeQueue == nil || cdm.writeQueue.queueIsEmpty() {
                nextSeq := uint64(0)
                if delErr != nil {
                        // In case of error, set it to the last file number on the disk that was not deleted.
                        nextSeq = uint64(undeleted[len(undeleted)-1])
                }
                cdm.evtlPos.setSeq(nextSeq)
        }
        cdm.evtlPosMtx.Unlock()

        return errs.Err()
}

// deleteFiles deletes the given file sequences in order of the sequence.
// It first closes the files and drops them from the in-memory maps while holding
// readPathMtx, and only then removes them from disk without the lock.
// In case of an error, it returns the sorted file sequences that were not deleted from the _disk_.
func (cdm *ChunkDiskMapper) deleteFiles(removedFiles []int) ([]int, error) {
        slices.Sort(removedFiles) // To delete them in order.

        // Phase one: close each file and forget about it, all under the read path lock.
        closeAndForget := func() error {
                cdm.readPathMtx.Lock()
                defer cdm.readPathMtx.Unlock()

                for _, seq := range removedFiles {
                        if err := cdm.closers[seq].Close(); err != nil {
                                return err
                        }
                        delete(cdm.mmappedChunkFiles, seq)
                        delete(cdm.closers, seq)
                }
                return nil
        }
        if err := closeAndForget(); err != nil {
                // Nothing has been removed from disk yet, so every sequence is still pending.
                return removedFiles, err
        }

        // Phase two: we actually delete the files separately to not block the readPathMtx for long.
        for i := 0; i < len(removedFiles); i++ {
                path := segmentFile(cdm.dir.Name(), removedFiles[i])
                if err := os.Remove(path); err != nil {
                        return removedFiles[i:], err
                }
        }

        return nil, nil
}

// DeleteCorrupted deletes all the head chunk files after the one which had the corruption
// (including the corrupt file). originalErr must be, or wrap, a *CorruptionErr; any other
// error is returned wrapped and nothing is deleted. Afterwards the eventual position
// sequence is set to the highest remaining file sequence.
func (cdm *ChunkDiskMapper) DeleteCorrupted(originalErr error) error {
        var cerr *CorruptionErr
        if !errors.As(originalErr, &cerr) {
                return fmt.Errorf("cannot handle error: %w", originalErr)
        }

        // Split the known files into those to delete (the corrupt file and everything after it)
        // and those to keep, remembering the highest sequence among the latter.
        var (
                toDelete []int
                lastKept int
        )
        cdm.readPathMtx.RLock()
        for seg := range cdm.mmappedChunkFiles {
                if seg >= cerr.FileIndex {
                        toDelete = append(toDelete, seg)
                } else if seg > lastKept {
                        lastKept = seg
                }
        }
        cdm.readPathMtx.RUnlock()

        undeleted, err := cdm.deleteFiles(toDelete)

        newSeq := uint64(lastKept)
        if err != nil {
                // In case of error, set it to the last file number on the disk that was not deleted.
                newSeq = uint64(undeleted[len(undeleted)-1])
        }
        cdm.evtlPosMtx.Lock()
        cdm.evtlPos.setSeq(newSeq)
        cdm.evtlPosMtx.Unlock()

        return err
}

// Size returns the size of the chunk files, computed by fileutil.DirSize over
// the mapper's directory.
func (cdm *ChunkDiskMapper) Size() (int64, error) {
        dirPath := cdm.dir.Name()
        return fileutil.DirSize(dirPath)
}

// curFileSize returns the value currently stored in curFileOffset.
func (cdm *ChunkDiskMapper) curFileSize() uint64 {
        offset := cdm.curFileOffset.Load()
        return offset
}

// Close closes all the open files in ChunkDiskMapper.
// It is no longer safe to access chunks from this struct after calling Close.
// Calling Close on an already closed mapper returns nil.
func (cdm *ChunkDiskMapper) Close() error {
        // Locking the eventual position lock blocks WriteChunk()
        cdm.evtlPosMtx.Lock()
        defer cdm.evtlPosMtx.Unlock()

        // Stop the write queue, if one is configured, before taking the path locks.
        if cdm.writeQueue != nil {
                cdm.writeQueue.stop()
        }

        // 'WriteChunk' locks writePathMtx first and then readPathMtx for cutting head chunk file.
        // The lock order should not be reversed here else it can cause deadlocks.
        cdm.writePathMtx.Lock()
        defer cdm.writePathMtx.Unlock()
        cdm.readPathMtx.Lock()
        defer cdm.readPathMtx.Unlock()

        if cdm.closed {
                return nil
        }
        cdm.closed = true

        // Close the tracked files, finalize the current file and close the directory,
        // collecting every error rather than stopping at the first one.
        errs := tsdb_errors.NewMulti(
                closeAllFromMap(cdm.closers),
                cdm.finalizeCurFile(),
                cdm.dir.Close(),
        )
        // Replace the maps with empty ones so that no closed file is referenced anymore.
        cdm.mmappedChunkFiles = map[int]*mmappedChunkFile{}
        cdm.closers = map[int]io.Closer{}

        return errs.Err()
}

// closeAllFromMap closes every closer in cs and returns the combined errors, if any.
// A failing Close does not stop the remaining closers from being closed.
func closeAllFromMap(cs map[int]io.Closer) error {
        errs := tsdb_errors.NewMulti()
        for key := range cs {
                closeErr := cs[key].Close()
                errs.Add(closeErr)
        }
        return errs.Err()
}

// inBufferShards is the number of shards, each with its own map and lock, that a chunkBuffer is split into.
const inBufferShards = 128 // 128 is a randomly chosen number.

// chunkBuffer is a thread safe lookup table for chunks by their ref.
// A ref is assigned to the shard with index ref % inBufferShards.
type chunkBuffer struct {
        // inBufferChunks holds one ref-to-chunk map per shard.
        inBufferChunks     [inBufferShards]map[ChunkDiskMapperRef]chunkenc.Chunk
        // inBufferChunksMtxs holds one lock per shard, guarding the map at the same index.
        inBufferChunksMtxs [inBufferShards]sync.RWMutex
}

// newChunkBuffer returns a chunkBuffer in which every shard has an empty, ready-to-use map.
func newChunkBuffer() *chunkBuffer {
        cb := new(chunkBuffer)
        for shard := range cb.inBufferChunks {
                cb.inBufferChunks[shard] = map[ChunkDiskMapperRef]chunkenc.Chunk{}
        }
        return cb
}

// put stores chk under ref in the shard that ref maps to, replacing any previous entry.
func (cb *chunkBuffer) put(ref ChunkDiskMapperRef, chk chunkenc.Chunk) {
        shard := ref % inBufferShards
        mtx := &cb.inBufferChunksMtxs[shard]

        mtx.Lock()
        defer mtx.Unlock()

        cb.inBufferChunks[shard][ref] = chk
}

// get returns the chunk stored under ref, or the zero value (nil) if there is none.
func (cb *chunkBuffer) get(ref ChunkDiskMapperRef) chunkenc.Chunk {
        shard := ref % inBufferShards
        mtx := &cb.inBufferChunksMtxs[shard]

        mtx.RLock()
        chk := cb.inBufferChunks[shard][ref]
        mtx.RUnlock()

        return chk
}

// clear drops all buffered chunks by replacing each shard's map with a fresh empty one.
// Shards are locked one at a time, never all at once.
func (cb *chunkBuffer) clear() {
        for shard := range cb.inBufferChunks {
                mtx := &cb.inBufferChunksMtxs[shard]
                mtx.Lock()
                cb.inBufferChunks[shard] = map[ChunkDiskMapperRef]chunkenc.Chunk{}
                mtx.Unlock()
        }
}

// Copyright 2022 The Prometheus Authors
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package chunks

import "sync"

// writeJobQueue is similar to a buffered channel of chunkWriteJob, but manages its own buffers
// to avoid using a lot of memory when it's empty. It does that by storing elements in segments
// of equal size (segmentSize). When a segment is not used anymore, references to it are removed,
// so it can be garbage collected.
type writeJobQueue struct {
        maxSize     int // maximum number of jobs in the queue; push blocks while size >= maxSize
        segmentSize int // number of job slots allocated for each segment

        mtx            sync.Mutex            // protects all following variables
        pushed, popped *sync.Cond            // signalled when something is pushed into the queue or popped from it
        first, last    *writeJobQueueSegment // pointer to first and last segment, if any
        size           int                   // total size of the queue
        closed         bool                  // after closing the queue, nothing can be pushed to it
}

// writeJobQueueSegment is one fixed-size buffer of jobs within a writeJobQueue.
// Segments are chained through nextSegment to form the queue.
type writeJobQueueSegment struct {
        segment             []chunkWriteJob       // job slots of this segment
        nextRead, nextWrite int                   // index of next read and next write in this segment.
        nextSegment         *writeJobQueueSegment // next segment, if any
}

// newWriteJobQueue returns an empty queue holding at most maxSize jobs, stored in
// segments of segmentSize jobs each. It panics if either argument is not positive.
func newWriteJobQueue(maxSize, segmentSize int) *writeJobQueue {
        switch {
        case maxSize <= 0, segmentSize <= 0:
                panic("invalid queue")
        }

        q := new(writeJobQueue)
        q.maxSize, q.segmentSize = maxSize, segmentSize

        // Both conditions share the queue's mutex.
        q.pushed, q.popped = sync.NewCond(&q.mtx), sync.NewCond(&q.mtx)
        return q
}

// close marks the queue as closed and wakes up every goroutine blocked in push or pop.
func (q *writeJobQueue) close() {
        q.mtx.Lock()

        q.closed = true

        // Unblock all blocked goroutines.
        q.pushed.Broadcast()
        q.popped.Broadcast()

        q.mtx.Unlock()
}

// push blocks until there is space available in the queue, and then adds job to the queue.
// If queue is closed or gets closed while waiting for space, push returns false.
func (q *writeJobQueue) push(job chunkWriteJob) bool {
        q.mtx.Lock()
        defer q.mtx.Unlock()

        // Wait until queue has more space or is closed.
        for {
                if q.closed {
                        return false
                }
                if q.size < q.maxSize {
                        break
                }
                q.popped.Wait()
        }

        // Append a fresh segment when there is none yet or the last one is full.
        tail := q.last
        if tail == nil || tail.nextWrite >= q.segmentSize {
                fresh := &writeJobQueueSegment{
                        segment: make([]chunkWriteJob, q.segmentSize),
                }
                if tail != nil {
                        tail.nextSegment = fresh
                }
                if q.first == nil {
                        q.first = fresh
                }
                q.last = fresh
                tail = fresh
        }

        tail.segment[tail.nextWrite] = job
        tail.nextWrite++
        q.size++
        q.pushed.Signal()
        return true
}

// pop returns first job from the queue, and true.
// If queue is empty, pop blocks until there is a job (returns true), or until queue is closed (returns false).
// If queue was already closed, pop first returns all remaining elements from the queue (with true value), and only then returns false.
func (q *writeJobQueue) pop() (chunkWriteJob, bool) {
        q.mtx.Lock()
        defer q.mtx.Unlock()

        // Wait until something is pushed to the queue, or queue is closed.
        for {
                if q.size != 0 {
                        break
                }
                if q.closed {
                        return chunkWriteJob{}, false
                }
                q.pushed.Wait()
        }

        head := q.first
        slot := &head.segment[head.nextRead]
        job := *slot
        *slot = chunkWriteJob{} // Clear the just-read element.
        head.nextRead++
        q.size--

        // If we have read all possible elements from first segment, we can drop it.
        if head.nextRead >= q.segmentSize {
                q.first = head.nextSegment
                if q.first == nil {
                        q.last = nil
                }
        }

        q.popped.Signal()
        return job, true
}

// length returns the number of jobs currently in the queue.
func (q *writeJobQueue) length() int {
        q.mtx.Lock()
        n := q.size
        q.mtx.Unlock()

        return n
}

// Copyright 2023 The Prometheus Authors
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package chunks

import (
        "github.com/prometheus/prometheus/model/histogram"
        "github.com/prometheus/prometheus/tsdb/chunkenc"
)

// Samples is an indexable, read-only collection of Sample values.
type Samples interface {
        // Get returns the sample at index i.
        Get(i int) Sample
        // Len returns the number of samples in the collection.
        Len() int
}

// Sample is a single data point that exposes an int64 T, a float value, a histogram
// and a float histogram, together with the type of value it holds.
type Sample interface {
        // T returns the sample's int64 component (presumably its timestamp; confirm with callers).
        T() int64
        // F returns the float value.
        F() float64
        // H returns the histogram.
        H() *histogram.Histogram
        // FH returns the float histogram.
        FH() *histogram.FloatHistogram
        // Type reports which kind of value the sample holds.
        Type() chunkenc.ValueType
}

// SampleSlice is a slice of Sample that provides the Get and Len methods of Samples.
type SampleSlice []Sample

// Get returns the sample at index i. It panics if i is out of range.
func (s SampleSlice) Get(i int) Sample {
        return s[i]
}

// Len returns the number of samples in the slice.
func (s SampleSlice) Len() int {
        return len(s)
}

// sample is a plain value implementation of the Sample accessors.
// At most one of h and fh is expected to be set; Type gives h precedence.
type sample struct {
        t  int64
        f  float64
        h  *histogram.Histogram
        fh *histogram.FloatHistogram
}

// T returns the t field.
func (s sample) T() int64 { return s.t }

// F returns the float value.
func (s sample) F() float64 { return s.f }

// H returns the histogram, which may be nil.
func (s sample) H() *histogram.Histogram { return s.h }

// FH returns the float histogram, which may be nil.
func (s sample) FH() *histogram.FloatHistogram { return s.fh }

// Type returns ValHistogram if a histogram is set, otherwise ValFloatHistogram
// if a float histogram is set, and ValFloat if neither is.
func (s sample) Type() chunkenc.ValueType {
        if s.h != nil {
                return chunkenc.ValHistogram
        }
        if s.fh != nil {
                return chunkenc.ValFloatHistogram
        }
        return chunkenc.ValFloat
}

// GenerateSamples returns numSamples float samples for the consecutive integers
// start, start+1, ...; each sample uses its integer as both t and f.
func GenerateSamples(start, numSamples int) []Sample {
        floatSample := func(i int) Sample {
                return sample{t: int64(i), f: float64(i)}
        }
        return generateSamples(start, numSamples, floatSample)
}

// generateSamples calls gen for each integer in [start, start+numSamples) in
// ascending order and returns the produced samples in that order.
func generateSamples(start, numSamples int, gen func(int) Sample) []Sample {
        out := make([]Sample, 0, numSamples)
        for i, end := start, start+numSamples; i < end; i++ {
                next := gen(i)
                out = append(out, next)
        }
        return out
}

// Copyright 2017 The Prometheus Authors
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package tsdb

import (
        "context"
        "crypto/rand"
        "errors"
        "fmt"
        "io"
        "os"
        "path/filepath"
        "slices"
        "time"

        "github.com/go-kit/log"
        "github.com/go-kit/log/level"
        "github.com/oklog/ulid"
        "github.com/prometheus/client_golang/prometheus"

        "github.com/prometheus/prometheus/storage"
        "github.com/prometheus/prometheus/tsdb/chunkenc"
        "github.com/prometheus/prometheus/tsdb/chunks"
        tsdb_errors "github.com/prometheus/prometheus/tsdb/errors"
        "github.com/prometheus/prometheus/tsdb/fileutil"
        "github.com/prometheus/prometheus/tsdb/index"
        "github.com/prometheus/prometheus/tsdb/tombstones"
)

// ExponentialBlockRanges returns the time ranges based on the stepSize.
// The result has steps entries: the first is minSize and each following entry
// is the previous one multiplied by stepSize.
func ExponentialBlockRanges(minSize int64, steps, stepSize int) []int64 {
        out := make([]int64, 0, steps)
        factor := int64(stepSize)
        for size := minSize; len(out) < steps; size *= factor {
                out = append(out, size)
        }
        return out
}

// Compactor provides compaction against an underlying storage
// of time series data.
type Compactor interface {
        // Plan returns a set of directories that can be compacted concurrently.
        // The directories can be overlapping.
        // Results returned when compactions are in progress are undefined.
        Plan(dir string) ([]string, error)

        // Write persists one or more Blocks into a directory.
        // No Block is written when the resulting Block has 0 samples, and an empty slice is returned.
        // Prometheus always returns one or no block. The interface allows returning more than one
        // block for downstream users to experiment with compactor.
        Write(dest string, b BlockReader, mint, maxt int64, base *BlockMeta) ([]ulid.ULID, error)

        // Compact runs compaction against the provided directories. Must
        // only be called concurrently with results of Plan().
        // Can optionally pass a list of already open blocks,
        // to avoid having to reopen them.
        // Prometheus always returns one or no block. The interface allows returning more than one
        // block for downstream users to experiment with compactor.
        // When one resulting Block has 0 samples
        //  * No block is written.
        //  * The source dirs are marked Deletable.
        //  * Block is not included in the result.
        Compact(dest string, dirs []string, open []*Block) ([]ulid.ULID, error)
}

// LeveledCompactor implements the Compactor interface.
// Instances are created with NewLeveledCompactorWithOptions or one of its wrappers,
// which fill in defaults for unset values.
type LeveledCompactor struct {
        metrics                     *CompactorMetrics                    // compaction metrics, created via NewCompactorMetrics
        logger                      log.Logger                           // a no-op logger is used when none is provided
        ranges                      []int64                              // block time ranges; at least one is required
        chunkPool                   chunkenc.Pool                        // a new chunkenc pool is used when none is provided
        ctx                         context.Context                      // context passed in at construction
        maxBlockChunkSegmentSize    int64                                // defaults to chunks.DefaultChunkSegmentSize
        mergeFunc                   storage.VerticalChunkSeriesMergeFunc // used for merging series in vertical compaction
        postingsEncoder             index.PostingsEncoder                // defaults to index.EncodePostingsRaw
        enableOverlappingCompaction bool                                 // see LeveledCompactorOptions.EnableOverlappingCompaction
}

// CompactorMetrics holds the metrics of a Compactor. It is created by NewCompactorMetrics.
type CompactorMetrics struct {
        Ran               prometheus.Counter   // total number of compactions that were executed
        PopulatingBlocks  prometheus.Gauge     // set to 1 when a block is currently being written to the disk
        OverlappingBlocks prometheus.Counter   // total number of compactions done on overlapping blocks
        Duration          prometheus.Histogram // duration of compaction runs
        ChunkSize         prometheus.Histogram // final size of chunks on their first compaction
        ChunkSamples      prometheus.Histogram // final number of samples on their first compaction
        ChunkRange        prometheus.Histogram // final time range of chunks on their first compaction
}

// NewCompactorMetrics initializes metrics for Compactor.
// When r is non-nil, all metrics are registered with it; registration panics on failure.
func NewCompactorMetrics(r prometheus.Registerer) *CompactorMetrics {
        m := &CompactorMetrics{
                Ran: prometheus.NewCounter(prometheus.CounterOpts{
                        Name: "prometheus_tsdb_compactions_total",
                        Help: "Total number of compactions that were executed for the partition.",
                }),
                PopulatingBlocks: prometheus.NewGauge(prometheus.GaugeOpts{
                        Name: "prometheus_tsdb_compaction_populating_block",
                        Help: "Set to 1 when a block is currently being written to the disk.",
                }),
                OverlappingBlocks: prometheus.NewCounter(prometheus.CounterOpts{
                        Name: "prometheus_tsdb_vertical_compactions_total",
                        Help: "Total number of compactions done on overlapping blocks.",
                }),
                Duration: prometheus.NewHistogram(prometheus.HistogramOpts{
                        Name:                            "prometheus_tsdb_compaction_duration_seconds",
                        Help:                            "Duration of compaction runs",
                        Buckets:                         prometheus.ExponentialBuckets(1, 2, 14),
                        NativeHistogramBucketFactor:     1.1,
                        NativeHistogramMaxBucketNumber:  100,
                        NativeHistogramMinResetDuration: 1 * time.Hour,
                }),
                ChunkSize: prometheus.NewHistogram(prometheus.HistogramOpts{
                        Name:    "prometheus_tsdb_compaction_chunk_size_bytes",
                        Help:    "Final size of chunks on their first compaction",
                        Buckets: prometheus.ExponentialBuckets(32, 1.5, 12),
                }),
                ChunkSamples: prometheus.NewHistogram(prometheus.HistogramOpts{
                        Name:    "prometheus_tsdb_compaction_chunk_samples",
                        Help:    "Final number of samples on their first compaction",
                        Buckets: prometheus.ExponentialBuckets(4, 1.5, 12),
                }),
                ChunkRange: prometheus.NewHistogram(prometheus.HistogramOpts{
                        Name:    "prometheus_tsdb_compaction_chunk_range_seconds",
                        Help:    "Final time range of chunks on their first compaction",
                        Buckets: prometheus.ExponentialBuckets(100, 4, 10),
                }),
        }

        // Without a registerer the metrics are still usable, just not exported.
        if r == nil {
                return m
        }
        r.MustRegister(
                m.Ran,
                m.PopulatingBlocks,
                m.OverlappingBlocks,
                m.Duration,
                m.ChunkRange,
                m.ChunkSamples,
                m.ChunkSize,
        )
        return m
}

// LeveledCompactorOptions holds the optional settings of a LeveledCompactor.
// Zero values select the defaults described on each field.
type LeveledCompactorOptions struct {
        // PE specifies the postings encoder. It is called when compactor is writing out the postings for a label name/value pair during compaction.
        // If it is nil then the default encoder is used. At the moment that is the "raw" encoder. See index.EncodePostingsRaw for more.
        PE index.PostingsEncoder
        // MaxBlockChunkSegmentSize is the max block chunk segment size. If it is 0 then the default chunks.DefaultChunkSegmentSize is used.
        MaxBlockChunkSegmentSize int64
        // MergeFunc is used for merging series together in vertical compaction. By default storage.NewCompactingChunkSeriesMerger(storage.ChainedSeriesMerge) is used.
        MergeFunc storage.VerticalChunkSeriesMergeFunc
        // EnableOverlappingCompaction enables compaction of overlapping blocks. In Prometheus it is always enabled.
        // It is useful for downstream projects like Mimir, Cortex, Thanos where they have a separate component that does compaction.
        EnableOverlappingCompaction bool
}

// NewLeveledCompactorWithChunkSize returns a LeveledCompactor that uses the given maximum block
// chunk segment size and merge function, with compaction of overlapping blocks enabled.
// See NewLeveledCompactorWithOptions for the defaults applied to unset values.
func NewLeveledCompactorWithChunkSize(ctx context.Context, r prometheus.Registerer, l log.Logger, ranges []int64, pool chunkenc.Pool, maxBlockChunkSegmentSize int64, mergeFunc storage.VerticalChunkSeriesMergeFunc) (*LeveledCompactor, error) {
        opts := LeveledCompactorOptions{
                EnableOverlappingCompaction: true,
                MergeFunc:                   mergeFunc,
                MaxBlockChunkSegmentSize:    maxBlockChunkSegmentSize,
        }
        return NewLeveledCompactorWithOptions(ctx, r, l, ranges, pool, opts)
}

// NewLeveledCompactor returns a LeveledCompactor that uses the given merge function, the default
// chunk segment size, and has compaction of overlapping blocks enabled.
// See NewLeveledCompactorWithOptions for the defaults applied to unset values.
func NewLeveledCompactor(ctx context.Context, r prometheus.Registerer, l log.Logger, ranges []int64, pool chunkenc.Pool, mergeFunc storage.VerticalChunkSeriesMergeFunc) (*LeveledCompactor, error) {
        opts := LeveledCompactorOptions{
                EnableOverlappingCompaction: true,
                MergeFunc:                   mergeFunc,
        }
        return NewLeveledCompactorWithOptions(ctx, r, l, ranges, pool, opts)
}

// NewLeveledCompactorWithOptions returns a LeveledCompactor configured from opts.
// It returns an error if ranges is empty. Unset values get defaults: a new chunk pool,
// a no-op logger, the compacting chunk series merger, chunks.DefaultChunkSegmentSize
// and the raw postings encoder.
func NewLeveledCompactorWithOptions(ctx context.Context, r prometheus.Registerer, l log.Logger, ranges []int64, pool chunkenc.Pool, opts LeveledCompactorOptions) (*LeveledCompactor, error) {
        if len(ranges) == 0 {
                return nil, fmt.Errorf("at least one range must be provided")
        }
        if pool == nil {
                pool = chunkenc.NewPool()
        }
        if l == nil {
                l = log.NewNopLogger()
        }

        // opts is a by-value copy, so the defaults can be filled in place
        // without affecting the caller.
        if opts.MergeFunc == nil {
                opts.MergeFunc = storage.NewCompactingChunkSeriesMerger(storage.ChainedSeriesMerge)
        }
        if opts.MaxBlockChunkSegmentSize == 0 {
                opts.MaxBlockChunkSegmentSize = chunks.DefaultChunkSegmentSize
        }
        if opts.PE == nil {
                opts.PE = index.EncodePostingsRaw
        }

        compactor := &LeveledCompactor{
                metrics:                     NewCompactorMetrics(r),
                logger:                      l,
                ranges:                      ranges,
                chunkPool:                   pool,
                ctx:                         ctx,
                maxBlockChunkSegmentSize:    opts.MaxBlockChunkSegmentSize,
                mergeFunc:                   opts.MergeFunc,
                postingsEncoder:             opts.PE,
                enableOverlappingCompaction: opts.EnableOverlappingCompaction,
        }
        return compactor, nil
}

// dirMeta pairs a block directory with the block meta read from that directory.
type dirMeta struct {
        dir  string     // Path of the block directory.
        meta *BlockMeta // Meta information belonging to dir.
}

// Plan returns a list of compactable blocks in the provided directory.
// It reads the meta file of every block directory found under dir and passes
// them on to plan. Failing to list the block directories or to read any meta
// file aborts planning with that error.
func (c *LeveledCompactor) Plan(dir string) ([]string, error) {
        blockPaths, err := blockDirs(dir)
        if err != nil {
                return nil, err
        }
        if len(blockPaths) == 0 {
                return nil, nil
        }

        var candidates []dirMeta
        for _, blockDir := range blockPaths {
                meta, _, err := readMetaFile(blockDir)
                if err != nil {
                        return nil, err
                }
                candidates = append(candidates, dirMeta{dir: blockDir, meta: meta})
        }
        return c.plan(candidates)
}

// plan selects the block directories to compact next out of dms, which it sorts
// in place by ascending MinTime. Candidates are tried in this order:
//
//  1. all blocks with overlapping time ranges, as reported by selectOverlappingDirs,
//  2. a group of blocks picked by selectDirs, ignoring the block with the highest MinTime,
//  3. a single block that carries enough tombstones.
//
// It returns nil if nothing needs compacting, which includes an empty dms.
func (c *LeveledCompactor) plan(dms []dirMeta) ([]string, error) {
        // Without any block there is nothing to plan. Returning here also keeps the
        // dms[:len(dms)-1] reslice below from panicking on an empty slice.
        if len(dms) == 0 {
                return nil, nil
        }

        slices.SortFunc(dms, func(a, b dirMeta) int {
                switch {
                case a.meta.MinTime < b.meta.MinTime:
                        return -1
                case a.meta.MinTime > b.meta.MinTime:
                        return 1
                default:
                        return 0
                }
        })

        res := c.selectOverlappingDirs(dms)
        if len(res) > 0 {
                return res, nil
        }
        // No overlapping blocks, do compaction the usual way.
        // We do not include a recently created block with max(minTime), so the block which was just created from WAL.
        // This gives users a window of a full block size to piece-wise backup new data without having to care about data overlap.
        dms = dms[:len(dms)-1]

        for _, dm := range c.selectDirs(dms) {
                res = append(res, dm.dir)
        }
        if len(res) > 0 {
                return res, nil
        }

        // Compact any blocks with big enough time range that have >5% tombstones.
        // Blocks are visited from the highest MinTime downwards and the scan stops at
        // the first block spanning less than the middle entry of the configured ranges.
        for i := len(dms) - 1; i >= 0; i-- {
                meta := dms[i].meta
                if meta.MaxTime-meta.MinTime < c.ranges[len(c.ranges)/2] {
                        // If the block is entirely deleted, then we don't care about the block being big enough.
                        // TODO: This is assuming a single tombstone is for a distinct series, which might not be true.
                        if meta.Stats.NumTombstones > 0 && meta.Stats.NumTombstones >= meta.Stats.NumSeries {
                                return []string{dms[i].dir}, nil
                        }
                        break
                }
                if float64(meta.Stats.NumTombstones)/float64(meta.Stats.NumSeries+1) > 0.05 {
                        return []string{dms[i].dir}, nil
                }
        }

        return nil, nil
}

// selectDirs returns the dir metas that should be compacted into a single new block.
// If only a single block range is configured, the result is always nil.
//
// Every configured range but the first is tried in order. For each one the blocks
// are grouped with splitByRange and the first eligible group of at least two
// blocks is returned.
func (c *LeveledCompactor) selectDirs(ds []dirMeta) []dirMeta {
        if len(ds) == 0 || len(c.ranges) < 2 {
                return nil
        }

        // anyFailed reports whether a group holds a block whose compaction failed.
        anyFailed := func(group []dirMeta) bool {
                for _, dm := range group {
                        if dm.meta.Compaction.Failed {
                                return true
                        }
                }
                return false
        }

        newestMinTime := ds[len(ds)-1].meta.MinTime

        for _, rangeSize := range c.ranges[1:] {
                for _, group := range splitByRange(ds, rangeSize) {
                        // A lone block has nothing to be merged with.
                        if len(group) < 2 {
                                continue
                        }
                        // Do not select the range if it has a block whose compaction failed.
                        if anyFailed(group) {
                                continue
                        }

                        lo := group[0].meta.MinTime
                        hi := group[len(group)-1].meta.MaxTime
                        // Take the group once it spans the full range (gaps are fine) or
                        // lies entirely before the most recent block. That avoids compacting
                        // too early while another block of the same size still fits in.
                        if hi-lo == rangeSize || hi <= newestMinTime {
                                return group
                        }
                }
        }

        return nil
}

// selectOverlappingDirs returns all dirs with overlapping time ranges.
// It expects sorted input by mint and returns the overlapping dirs in the same order as received.
// Only the first run of overlapping blocks is reported. The result is nil when
// overlapping compaction is disabled or fewer than two blocks are given.
func (c *LeveledCompactor) selectOverlappingDirs(ds []dirMeta) []string {
        if !c.enableOverlappingCompaction || len(ds) < 2 {
                return nil
        }

        var (
                overlaps []string
                maxSeen  = ds[0].meta.MaxTime // Highest MaxTime of the blocks visited so far.
        )
        for i := 1; i < len(ds); i++ {
                cur := ds[i].meta
                switch {
                case cur.MinTime < maxSeen:
                        // The first overlap also pulls in the block right before this one.
                        if len(overlaps) == 0 {
                                overlaps = append(overlaps, ds[i-1].dir)
                        }
                        overlaps = append(overlaps, ds[i].dir)
                case len(overlaps) > 0:
                        // The run of overlapping blocks has ended.
                        return overlaps
                }
                if cur.MaxTime > maxSeen {
                        maxSeen = cur.MaxTime
                }
        }
        return overlaps
}

// splitByRange splits the directories by the time range. The range sequence starts at 0.
//
// For example, if we have blocks [0-10, 10-20, 50-60, 90-100] and the split range tr is 30
// it returns [0-10, 10-20], [50-60], [90-100].
//
// Blocks that do not fit into the aligned range their MinTime falls in are left out.
func splitByRange(ds []dirMeta, tr int64) [][]dirMeta {
        var groups [][]dirMeta

        i := 0
        for i < len(ds) {
                first := ds[i].meta

                // Start of the tr-aligned range containing the block's MinTime. Integer
                // division truncates towards zero, so negative times are shifted first
                // to round down instead.
                var start int64
                if first.MinTime < 0 {
                        start = tr * ((first.MinTime - tr + 1) / tr)
                } else {
                        start = tr * (first.MinTime / tr)
                }
                end := start + tr

                // Skip blocks that don't fall into the range. This can happen via mis-alignment or
                // by being a multiple of the intended range.
                if first.MaxTime > end {
                        i++
                        continue
                }

                // Collect this block and all following ones that end within [start, end].
                // The block at i passed the check above, so the group is never empty.
                var group []dirMeta
                for ; i < len(ds) && ds[i].meta.MaxTime <= end; i++ {
                        group = append(group, ds[i])
                }
                groups = append(groups, group)
        }

        return groups
}

// CompactBlockMetas merges many block metas into one, combining its source blocks together
// and adjusting compaction level. Min/Max time of result block meta covers all input blocks.
//
// The result carries uid, lists every input block as a parent, holds the sorted
// and de-duplicated union of the inputs' sources, and has a compaction level one
// above the highest input level. At least one block must be passed.
func CompactBlockMetas(uid ulid.ULID, blocks ...*BlockMeta) *BlockMeta {
        merged := &BlockMeta{
                ULID:    uid,
                MinTime: blocks[0].MinTime,
                MaxTime: blocks[0].MaxTime,
        }

        seen := make(map[ulid.ULID]struct{})
        for _, b := range blocks {
                if b.MinTime < merged.MinTime {
                        merged.MinTime = b.MinTime
                }
                if b.MaxTime > merged.MaxTime {
                        merged.MaxTime = b.MaxTime
                }
                if b.Compaction.Level > merged.Compaction.Level {
                        merged.Compaction.Level = b.Compaction.Level
                }

                // Record each source only once, no matter how many inputs share it.
                for _, src := range b.Compaction.Sources {
                        if _, dup := seen[src]; dup {
                                continue
                        }
                        seen[src] = struct{}{}
                        merged.Compaction.Sources = append(merged.Compaction.Sources, src)
                }

                parent := BlockDesc{ULID: b.ULID, MinTime: b.MinTime, MaxTime: b.MaxTime}
                merged.Compaction.Parents = append(merged.Compaction.Parents, parent)
        }
        merged.Compaction.Level++

        slices.SortFunc(merged.Compaction.Sources, ulid.ULID.Compare)

        return merged
}

// Compact creates a new block in the compactor's directory from the blocks in the
// provided directories. It is CompactWithBlockPopulator called with a
// DefaultBlockPopulator.
func (c *LeveledCompactor) Compact(dest string, dirs []string, open []*Block) ([]ulid.ULID, error) {
        var populator DefaultBlockPopulator
        return c.CompactWithBlockPopulator(dest, dirs, open, populator)
}

// CompactWithBlockPopulator compacts the blocks found in dirs into one new block
// under dest, using blockPopulator to fill it. A block in open whose ULID matches
// a directory's meta file is reused instead of being opened a second time.
//
// On success it returns the ULID of the new block. If the compacted result holds
// no samples, no ULID is returned and the source blocks are flagged as deletable
// in their meta files instead. If writing fails for any reason other than context
// cancellation, the source blocks are marked as having failed compaction.
func (c *LeveledCompactor) CompactWithBlockPopulator(dest string, dirs []string, open []*Block, blockPopulator BlockPopulator) ([]ulid.ULID, error) {
        var (
                blocks []BlockReader
                bs     []*Block
                metas  []*BlockMeta
                uids   []string
        )
        start := time.Now()

        for _, d := range dirs {
                meta, _, err := readMetaFile(d)
                if err != nil {
                        return nil, err
                }

                var b *Block

                // Use already open blocks if we can, to avoid
                // having the index data in memory twice.
                for _, o := range open {
                        if meta.ULID == o.Meta().ULID {
                                b = o
                                break
                        }
                }

                if b == nil {
                        var err error
                        b, err = OpenBlock(c.logger, d, c.chunkPool)
                        if err != nil {
                                return nil, err
                        }
                        // Deliberately deferred to function exit: the block has to stay
                        // open until the compacted block has been written.
                        defer b.Close()
                }

                metas = append(metas, meta)
                blocks = append(blocks, b)
                bs = append(bs, b)
                uids = append(uids, meta.ULID.String())
        }

        uid := ulid.MustNew(ulid.Now(), rand.Reader)

        meta := CompactBlockMetas(uid, metas...)
        if err := c.write(dest, meta, blockPopulator, blocks...); err != nil {
                errs := tsdb_errors.NewMulti(err)
                // A cancelled compaction says nothing about the blocks themselves,
                // so they are only marked as failed for other errors.
                if !errors.Is(err, context.Canceled) {
                        for _, b := range bs {
                                if err := b.setCompactionFailed(); err != nil {
                                        errs.Add(fmt.Errorf("setting compaction failed for block: %s: %w", b.Dir(), err))
                                }
                        }
                }
                return nil, errs.Err()
        }

        if meta.Stats.NumSamples == 0 {
                for _, b := range bs {
                        b.meta.Compaction.Deletable = true
                        n, err := writeMetaFile(c.logger, b.dir, &b.meta)
                        if err != nil {
                                // Best effort: the failure is logged, with its cause, and
                                // the remaining blocks are still processed.
                                level.Error(c.logger).Log(
                                        "msg", "Failed to write 'Deletable' to meta file after compaction",
                                        "ulid", b.meta.ULID,
                                        "err", err,
                                )
                        }
                        b.numBytesMeta = n
                }
                level.Info(c.logger).Log(
                        "msg", "compact blocks resulted in empty block",
                        "count", len(blocks),
                        "sources", fmt.Sprintf("%v", uids),
                        "duration", time.Since(start),
                )
                return nil, nil
        }

        level.Info(c.logger).Log(
                "msg", "compact blocks",
                "count", len(blocks),
                "mint", meta.MinTime,
                "maxt", meta.MaxTime,
                "ulid", meta.ULID,
                "sources", fmt.Sprintf("%v", uids),
                "duration", time.Since(start),
        )
        return []ulid.ULID{uid}, nil
}

// Write persists the data of b as a new level 1 block under dest, with mint and
// maxt recorded as the block's time range. A non-nil base is recorded as the
// parent of the new block and passes on its out-of-order marker.
//
// It returns the ULID of the new block, or no ULID at all if the written block
// turned out to hold no samples.
func (c *LeveledCompactor) Write(dest string, b BlockReader, mint, maxt int64, base *BlockMeta) ([]ulid.ULID, error) {
        began := time.Now()
        id := ulid.MustNew(ulid.Now(), rand.Reader)

        meta := &BlockMeta{ULID: id, MinTime: mint, MaxTime: maxt}
        meta.Compaction.Level = 1
        meta.Compaction.Sources = []ulid.ULID{id}

        if base != nil {
                parent := BlockDesc{ULID: base.ULID, MinTime: base.MinTime, MaxTime: base.MaxTime}
                meta.Compaction.Parents = []BlockDesc{parent}
                if base.Compaction.FromOutOfOrder() {
                        meta.Compaction.SetOutOfOrder()
                }
        }

        if err := c.write(dest, meta, DefaultBlockPopulator{}, b); err != nil {
                return nil, err
        }

        if meta.Stats.NumSamples > 0 {
                level.Info(c.logger).Log(
                        "msg", "write block",
                        "mint", meta.MinTime,
                        "maxt", meta.MaxTime,
                        "ulid", meta.ULID,
                        "duration", time.Since(began),
                        "ooo", meta.Compaction.FromOutOfOrder(),
                )
                return []ulid.ULID{id}, nil
        }

        // Nothing was written, so there is no block to report.
        level.Info(c.logger).Log(
                "msg", "write block resulted in empty block",
                "mint", meta.MinTime,
                "maxt", meta.MaxTime,
                "duration", time.Since(began),
        )
        return nil, nil
}

// instrumentedChunkWriter is used for level 1 compactions to record statistics
// about compacted chunks. It wraps a ChunkWriter and observes every chunk
// passed to WriteChunks before forwarding it.
type instrumentedChunkWriter struct {
        ChunkWriter

        size    prometheus.Histogram // Observes the byte length of each chunk.
        samples prometheus.Histogram // Observes the number of samples in each chunk.
        trange  prometheus.Histogram // Observes MaxTime-MinTime of each chunk.
}

// WriteChunks records size, sample count and time range of every given chunk in
// the writer's histograms and then forwards the chunks to the wrapped ChunkWriter.
func (w *instrumentedChunkWriter) WriteChunks(chunks ...chunks.Meta) error {
        for i := range chunks {
                chk := chunks[i].Chunk
                w.size.Observe(float64(len(chk.Bytes())))
                w.samples.Observe(float64(chk.NumSamples()))
                w.trange.Observe(float64(chunks[i].MaxTime - chunks[i].MinTime))
        }
        return w.ChunkWriter.WriteChunks(chunks...)
}

// write creates a new block that is the union of the provided blocks into dir.
//
// The block is assembled in a temporary directory next to its final location
// (dest/<ULID> plus the creation suffix) and only moved into place once chunks,
// index, meta file and tombstones file have been written and the directory has
// been synced. If populating yields no samples, write returns nil without
// creating the final block directory.
//
// On every exit path the deferred function closes writers that are still open,
// removes the temporary directory and updates the run counter and duration
// metrics. Errors from closing are merged into the returned error.
func (c *LeveledCompactor) write(dest string, meta *BlockMeta, blockPopulator BlockPopulator, blocks ...BlockReader) (err error) {
        dir := filepath.Join(dest, meta.ULID.String())
        tmp := dir + tmpForCreationBlockDirSuffix
        var closers []io.Closer
        defer func(t time.Time) {
                // Fold close errors of any still-open writer into the named result.
                err = tsdb_errors.NewMulti(err, tsdb_errors.CloseAll(closers)).Err()

                // RemoveAll returns no error when tmp doesn't exist so it is safe to always run it.
                if err := os.RemoveAll(tmp); err != nil {
                        level.Error(c.logger).Log("msg", "removed tmp folder after failed compaction", "err", err.Error())
                }
                c.metrics.Ran.Inc()
                c.metrics.Duration.Observe(time.Since(t).Seconds())
        }(time.Now())

        // Start from a clean temporary directory, dropping whatever is already there.
        if err = os.RemoveAll(tmp); err != nil {
                return err
        }

        if err = os.MkdirAll(tmp, 0o777); err != nil {
                return err
        }

        // Populate chunk and index files into temporary directory with
        // data of all blocks.
        var chunkw ChunkWriter

        chunkw, err = chunks.NewWriterWithSegSize(chunkDir(tmp), c.maxBlockChunkSegmentSize)
        if err != nil {
                return fmt.Errorf("open chunk writer: %w", err)
        }
        // The underlying writer is registered for closing before it may get wrapped below.
        closers = append(closers, chunkw)
        // Record written chunk sizes on level 1 compactions.
        if meta.Compaction.Level == 1 {
                chunkw = &instrumentedChunkWriter{
                        ChunkWriter: chunkw,
                        size:        c.metrics.ChunkSize,
                        samples:     c.metrics.ChunkSamples,
                        trange:      c.metrics.ChunkRange,
                }
        }

        indexw, err := index.NewWriterWithEncoder(c.ctx, filepath.Join(tmp, indexFilename), c.postingsEncoder)
        if err != nil {
                return fmt.Errorf("open index writer: %w", err)
        }
        closers = append(closers, indexw)

        if err := blockPopulator.PopulateBlock(c.ctx, c.metrics, c.logger, c.chunkPool, c.mergeFunc, blocks, meta, indexw, chunkw, AllSortedPostings); err != nil {
                return fmt.Errorf("populate block: %w", err)
        }

        // Give up before finalizing anything if the compactor's context is already done.
        select {
        case <-c.ctx.Done():
                return c.ctx.Err()
        default:
        }

        // We are explicitly closing them here to check for error even
        // though these are covered under defer. This is because in Windows,
        // you cannot delete these unless they are closed and the defer is to
        // make sure they are closed if the function exits due to an error above.
        errs := tsdb_errors.NewMulti()
        for _, w := range closers {
                errs.Add(w.Close())
        }
        closers = closers[:0] // Avoid closing the writers twice in the defer.
        if errs.Err() != nil {
                return errs.Err()
        }

        // Populated block is empty, so exit early.
        if meta.Stats.NumSamples == 0 {
                return nil
        }

        if _, err = writeMetaFile(c.logger, tmp, meta); err != nil {
                return fmt.Errorf("write merged meta: %w", err)
        }

        // Create an empty tombstones file.
        if _, err := tombstones.WriteFile(c.logger, tmp, tombstones.NewMemTombstones()); err != nil {
                return fmt.Errorf("write new tombstones file: %w", err)
        }

        df, err := fileutil.OpenDir(tmp)
        if err != nil {
                return fmt.Errorf("open temporary block dir: %w", err)
        }
        // Closes the directory handle on early returns; df is set to nil further down
        // once it has been closed explicitly.
        defer func() {
                if df != nil {
                        df.Close()
                }
        }()

        if err := df.Sync(); err != nil {
                return fmt.Errorf("sync temporary dir file: %w", err)
        }

        // Close temp dir before rename block dir (for windows platform).
        if err = df.Close(); err != nil {
                return fmt.Errorf("close temporary dir: %w", err)
        }
        df = nil

        // Block successfully written, make it visible in destination dir by moving it from tmp one.
        if err := fileutil.Replace(tmp, dir); err != nil {
                return fmt.Errorf("rename block dir: %w", err)
        }

        return nil
}

// BlockPopulator is the hook through which the compactor's write step fills a
// new block: it is handed the source blocks, the meta of the block being written
// and the index and chunk writers to write to.
type BlockPopulator interface {
        // PopulateBlock writes the data for the new block described by meta, read
        // from blocks, to indexw and chunkw. postingsFunc supplies the postings to
        // iterate for each source block's index reader.
        PopulateBlock(ctx context.Context, metrics *CompactorMetrics, logger log.Logger, chunkPool chunkenc.Pool, mergeFunc storage.VerticalChunkSeriesMergeFunc, blocks []BlockReader, meta *BlockMeta, indexw IndexWriter, chunkw ChunkWriter, postingsFunc IndexReaderPostingsFunc) error
}

// IndexReaderPostingsFunc is a function to get a sorted postings iterator from a given index reader.
// AllSortedPostings is the implementation the compactor passes to its BlockPopulator.
type IndexReaderPostingsFunc func(ctx context.Context, reader IndexReader) index.Postings

// AllSortedPostings returns a sorted iterator over all postings of the input
// index reader. If the postings cannot be looked up, the returned iterator
// carries that error instead.
func AllSortedPostings(ctx context.Context, reader IndexReader) index.Postings {
        name, value := index.AllPostingsKey()

        unsorted, lookupErr := reader.Postings(ctx, name, value)
        if lookupErr != nil {
                return index.ErrPostings(lookupErr)
        }

        return reader.SortedPostings(unsorted)
}

// DefaultBlockPopulator is the BlockPopulator used by Compact and Write.
// It carries no state.
type DefaultBlockPopulator struct{}

// PopulateBlock fills the index and chunk writers with new data gathered as the union
// of the provided blocks. The number of series, chunks and samples written is
// added to meta.Stats; only an error is returned.
// It expects sorted blocks input by mint.
//
// Symbols of all blocks are written first, followed by every series that still
// has at least one chunk. ctx is checked once per block and once per series, and
// its error is returned when it is done. All readers opened here are closed on
// return, and close errors are merged into the returned error.
func (c DefaultBlockPopulator) PopulateBlock(ctx context.Context, metrics *CompactorMetrics, logger log.Logger, chunkPool chunkenc.Pool, mergeFunc storage.VerticalChunkSeriesMergeFunc, blocks []BlockReader, meta *BlockMeta, indexw IndexWriter, chunkw ChunkWriter, postingsFunc IndexReaderPostingsFunc) (err error) {
        if len(blocks) == 0 {
                return errors.New("cannot populate block from no readers")
        }

        var (
                sets        []storage.ChunkSeriesSet
                symbols     index.StringIter
                closers     []io.Closer
                overlapping bool
        )
        defer func() {
                // Close every reader opened below and fold close errors into the named result.
                errs := tsdb_errors.NewMulti(err)
                if cerr := tsdb_errors.CloseAll(closers); cerr != nil {
                        errs.Add(fmt.Errorf("close: %w", cerr))
                }
                err = errs.Err()
                metrics.PopulatingBlocks.Set(0)
        }()
        metrics.PopulatingBlocks.Set(1)

        // Highest MaxTime of the blocks visited so far, used to detect overlaps.
        globalMaxt := blocks[0].Meta().MaxTime
        for i, b := range blocks {
                select {
                case <-ctx.Done():
                        return ctx.Err()
                default:
                }

                // Overlap is counted and logged at most once per call.
                if !overlapping {
                        if i > 0 && b.Meta().MinTime < globalMaxt {
                                metrics.OverlappingBlocks.Inc()
                                overlapping = true
                                level.Info(logger).Log("msg", "Found overlapping blocks during compaction", "ulid", meta.ULID)
                        }
                        if b.Meta().MaxTime > globalMaxt {
                                globalMaxt = b.Meta().MaxTime
                        }
                }

                indexr, err := b.Index()
                if err != nil {
                        return fmt.Errorf("open index reader for block %+v: %w", b.Meta(), err)
                }
                closers = append(closers, indexr)

                chunkr, err := b.Chunks()
                if err != nil {
                        return fmt.Errorf("open chunk reader for block %+v: %w", b.Meta(), err)
                }
                closers = append(closers, chunkr)

                tombsr, err := b.Tombstones()
                if err != nil {
                        return fmt.Errorf("open tombstone reader for block %+v: %w", b.Meta(), err)
                }
                closers = append(closers, tombsr)

                postings := postingsFunc(ctx, indexr)
                // Blocks meta is half open: [min, max), so subtract 1 to ensure we don't hold samples with exact meta.MaxTime timestamp.
                sets = append(sets, NewBlockChunkSeriesSet(b.Meta().ULID, indexr, chunkr, tombsr, postings, meta.MinTime, meta.MaxTime-1, false))
                syms := indexr.Symbols()
                // The first block seeds the symbol iterator; every later block is merged into it.
                if i == 0 {
                        symbols = syms
                        continue
                }
                symbols = NewMergedStringIter(symbols, syms)
        }

        // Write the symbols of all blocks to the index before any series is added.
        for symbols.Next() {
                if err := indexw.AddSymbol(symbols.At()); err != nil {
                        return fmt.Errorf("add symbol: %w", err)
                }
        }
        if err := symbols.Err(); err != nil {
                return fmt.Errorf("next symbol: %w", err)
        }

        var (
                ref      = storage.SeriesRef(0) // Reference given to the next written series.
                chks     []chunks.Meta          // Scratch slice reused for every series.
                chksIter chunks.Iterator
        )

        set := sets[0]
        if len(sets) > 1 {
                // Merge series using specified chunk series merger.
                // The default one is the compacting series merger.
                set = storage.NewMergeChunkSeriesSet(sets, mergeFunc)
        }

        // Iterate over all sorted chunk series.
        for set.Next() {
                select {
                case <-ctx.Done():
                        return ctx.Err()
                default:
                }
                s := set.At()
                chksIter = s.Iterator(chksIter)
                chks = chks[:0]
                for chksIter.Next() {
                        // We are not iterating in a streaming way over chunks as
                        // it's more efficient to do bulk write for index and
                        // chunk file purposes.
                        chks = append(chks, chksIter.At())
                }
                if err := chksIter.Err(); err != nil {
                        return fmt.Errorf("chunk iter: %w", err)
                }

                // Skip series with all deleted chunks.
                if len(chks) == 0 {
                        continue
                }

                if err := chunkw.WriteChunks(chks...); err != nil {
                        return fmt.Errorf("write chunks: %w", err)
                }
                if err := indexw.AddSeries(ref, s.Labels(), chks...); err != nil {
                        return fmt.Errorf("add series: %w", err)
                }

                meta.Stats.NumChunks += uint64(len(chks))
                meta.Stats.NumSeries++
                for _, chk := range chks {
                        meta.Stats.NumSamples += uint64(chk.Chunk.NumSamples())
                }

                // The chunks have been written, so they are handed back to the pool.
                for _, chk := range chks {
                        if err := chunkPool.Put(chk.Chunk); err != nil {
                                return fmt.Errorf("put chunk: %w", err)
                        }
                }
                // Only series that were actually written consume a reference.
                ref++
        }
        if err := set.Err(); err != nil {
                return fmt.Errorf("iterate compaction set: %w", err)
        }

        return nil
}

// Copyright 2017 The Prometheus Authors
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// Package tsdb implements a time series storage for float64 sample data.
package tsdb

import (
        "context"
        "errors"
        "fmt"
        "io"
        "io/fs"
        "math"
        "os"
        "path/filepath"
        "slices"
        "strings"
        "sync"
        "time"

        "github.com/go-kit/log"
        "github.com/go-kit/log/level"
        "github.com/oklog/ulid"
        "github.com/prometheus/client_golang/prometheus"
        "go.uber.org/atomic"
        "golang.org/x/sync/errgroup"

        "github.com/prometheus/prometheus/config"
        "github.com/prometheus/prometheus/model/labels"
        "github.com/prometheus/prometheus/storage"
        "github.com/prometheus/prometheus/tsdb/chunkenc"
        "github.com/prometheus/prometheus/tsdb/chunks"
        tsdb_errors "github.com/prometheus/prometheus/tsdb/errors"
        "github.com/prometheus/prometheus/tsdb/fileutil"
        _ "github.com/prometheus/prometheus/tsdb/goversion" // Load the package into main to make sure minimum Go version is met.
        "github.com/prometheus/prometheus/tsdb/tsdbutil"
        "github.com/prometheus/prometheus/tsdb/wlog"
)

const (
        // DefaultBlockDuration is the default duration of a block in milliseconds (two hours).
        DefaultBlockDuration = int64(2 * time.Hour / time.Millisecond)

        // Block dir suffixes to make deletion and creation operations atomic.
        // We decided to use suffixes instead of creating meta.json last (or deleting it first),
        // because in the error case you can still recover meta.json from the block content within the local TSDB dir.
        // TODO(bwplotka): TSDB can end up with various .tmp files (e.g. meta.json.tmp, WAL or segment tmp file). Think
        // about removing those too on start to save space. Currently only blocks tmp dirs are removed.
        tmpForDeletionBlockDirSuffix = ".tmp-for-deletion"
        tmpForCreationBlockDirSuffix = ".tmp-for-creation"
        // Pre-2.21 tmp dir suffix, used in clean-up functions.
        tmpLegacy = ".tmp"
)

// ErrNotReady is returned if the underlying storage is not ready yet.
// It is a sentinel value; compare against it with errors.Is.
var ErrNotReady = errors.New("TSDB not ready")

// DefaultOptions returns the options used for the DB by default. They are
// reasonable for setups using millisecond precision timestamps. Each call
// returns a fresh value.
func DefaultOptions() *Options {
        // Data is retained for 15 days by default.
        const retention = 15 * 24 * time.Hour

        opts := new(Options)

        // WAL.
        opts.WALSegmentSize = wlog.DefaultSegmentSize
        opts.WALCompression = wlog.CompressionNone

        // Blocks and retention.
        opts.MaxBlockChunkSegmentSize = chunks.DefaultChunkSegmentSize
        opts.RetentionDuration = int64(retention / time.Millisecond)
        opts.MinBlockDuration = DefaultBlockDuration
        opts.MaxBlockDuration = DefaultBlockDuration
        opts.EnableOverlappingCompaction = true
        opts.NoLockfile = false

        // Head.
        opts.SamplesPerChunk = DefaultSamplesPerChunk
        opts.StripeSize = DefaultStripeSize
        opts.HeadChunksWriteBufferSize = chunks.DefaultWriteBufferSize
        opts.HeadChunksWriteQueueSize = chunks.DefaultWriteQueueSize
        opts.IsolationDisabled = defaultIsolationDisabled
        opts.OutOfOrderCapMax = DefaultOutOfOrderCapMax
        opts.EnableSharding = false

        return opts
}

// Options of the DB storage.
type Options struct {
        // Segments (wal files) max size.
        // WALSegmentSize = 0, segment size is default size.
        // WALSegmentSize > 0, segment size is WALSegmentSize.
        // WALSegmentSize < 0, wal is disabled.
        WALSegmentSize int

        // MaxBlockChunkSegmentSize is the max size of block chunk segment files.
        // MaxBlockChunkSegmentSize = 0, chunk segment size is default size.
        // MaxBlockChunkSegmentSize > 0, chunk segment size is MaxBlockChunkSegmentSize.
        MaxBlockChunkSegmentSize int64

        // Duration of persisted data to keep.
        // Unit agnostic as long as unit is consistent with MinBlockDuration and MaxBlockDuration.
        // Typically it is in milliseconds.
        RetentionDuration int64

        // Maximum number of bytes in blocks to be retained.
        // 0 or less means disabled.
        // NOTE: For proper storage calculations, the size of the WAL folder
        // needs to be considered as well; it is not added when calculating
        // the current size of the database.
        MaxBytes int64

        // NoLockfile disables creation and consideration of a lock file.
        NoLockfile bool

        // WALCompression configures the compression type to use on records in the WAL.
        WALCompression wlog.CompressionType

        // Maximum number of CPUs that can simultaneously process WAL replay.
        // If it is <=0, then GOMAXPROCS is used.
        WALReplayConcurrency int

        // StripeSize is the size in entries of the series hash map. Reducing the size will save memory but impact performance.
        StripeSize int

        // The timestamp range of head blocks after which they get persisted.
        // It's the minimum duration of any persisted block.
        // Unit agnostic as long as unit is consistent with RetentionDuration and MaxBlockDuration.
        // Typically it is in milliseconds.
        MinBlockDuration int64

        // The maximum timestamp range of compacted blocks.
        // Unit agnostic as long as unit is consistent with MinBlockDuration and RetentionDuration.
        // Typically it is in milliseconds.
        MaxBlockDuration int64

        // HeadChunksWriteBufferSize configures the write buffer size used by the head chunks mapper.
        HeadChunksWriteBufferSize int

        // HeadChunksWriteQueueSize configures the size of the chunk write queue used in the head chunks mapper.
        HeadChunksWriteQueueSize int

        // SamplesPerChunk configures the target number of samples per chunk.
        SamplesPerChunk int

        // SeriesLifecycleCallback specifies a list of callbacks that will be called during a lifecycle of a series.
        // It is always a no-op in Prometheus and mainly meant for external users who import TSDB.
        SeriesLifecycleCallback SeriesLifecycleCallback

        // BlocksToDelete is a function which returns the blocks which can be deleted.
        // It is always the default time and size based retention in Prometheus and
        // mainly meant for external users who import TSDB.
        BlocksToDelete BlocksToDeleteFunc

        // Enables the in memory exemplar storage.
        EnableExemplarStorage bool

        // Enables the snapshot of in-memory chunks on shutdown. This makes restarts faster.
        EnableMemorySnapshotOnShutdown bool

        // MaxExemplars sets the size, in # of exemplars stored, of the single circular buffer used to store exemplars in memory.
        // See tsdb/exemplar.go, specifically the CircularExemplarStorage struct and its constructor NewCircularExemplarStorage.
        MaxExemplars int64

        // Disables isolation between reads and in-flight appends.
        IsolationDisabled bool

        // EnableNativeHistograms enables the ingestion of native histograms.
        EnableNativeHistograms bool

        // OutOfOrderTimeWindow specifies how much out of order is allowed, if any.
        // This can change during run-time, so this value from here should only be used
        // while initialising.
        OutOfOrderTimeWindow int64

        // OutOfOrderCapMax is maximum capacity for OOO chunks (in samples).
        // If it is <=0, the default value is assumed.
        OutOfOrderCapMax int64

        // Compaction of overlapping blocks is allowed if EnableOverlappingCompaction is true.
        // This is an optional flag for overlapping blocks.
        // The reason why this flag exists is because there are various users of the TSDB
        // that do not want vertical compaction happening on ingest time. Instead,
        // they'd rather keep overlapping blocks and let another component do the overlapping compaction later.
        // For Prometheus, this will always be true.
        EnableOverlappingCompaction bool

        // EnableSharding enables query sharding support in TSDB.
        EnableSharding bool

        // NewCompactorFunc is a function that returns a TSDB compactor.
        NewCompactorFunc NewCompactorFunc

        // BlockQuerierFunc is a function to return storage.Querier from a BlockReader.
        BlockQuerierFunc BlockQuerierFunc

        // BlockChunkQuerierFunc is a function to return storage.ChunkQuerier from a BlockReader.
        BlockChunkQuerierFunc BlockChunkQuerierFunc
}

// NewCompactorFunc is the signature of a factory that returns a TSDB Compactor
// (or an error) for the given registerer, logger, block ranges, chunk pool and options.
type NewCompactorFunc func(ctx context.Context, r prometheus.Registerer, l log.Logger, ranges []int64, pool chunkenc.Pool, opts *Options) (Compactor, error)

// BlocksToDeleteFunc takes a list of blocks and returns the set of block
// ULIDs which can be deleted.
type BlocksToDeleteFunc func(blocks []*Block) map[ulid.ULID]struct{}

// BlockQuerierFunc returns a storage.Querier over the BlockReader b for the
// time range given by mint and maxt.
type BlockQuerierFunc func(b BlockReader, mint, maxt int64) (storage.Querier, error)

// BlockChunkQuerierFunc returns a storage.ChunkQuerier over the BlockReader b
// for the time range given by mint and maxt.
type BlockChunkQuerierFunc func(b BlockReader, mint, maxt int64) (storage.ChunkQuerier, error)

// DB handles reads and writes of time series falling into
// a hashed partition of a series database.
type DB struct {
        // dir is the directory the database lives in.
        dir    string
        locker *tsdbutil.DirLocker

        logger         log.Logger
        metrics        *dbMetrics
        opts           *Options
        chunkPool      chunkenc.Pool
        compactor      Compactor
        blocksToDelete BlocksToDeleteFunc

        // mtx must be held when modifying the general block layout or lastGarbageCollectedMmapRef.
        mtx    sync.RWMutex
        blocks []*Block

        // The last OOO chunk that was compacted and written to disk. New queriers must not read chunks less
        // than or equal to this reference, as these chunks could be garbage collected at any time.
        lastGarbageCollectedMmapRef chunks.ChunkDiskMapperRef

        head *Head

        compactc chan struct{}
        donec    chan struct{}
        stopc    chan struct{}

        // cmtx ensures that compactions and deletions don't run simultaneously.
        cmtx sync.Mutex

        // autoCompactMtx ensures that no compaction gets triggered while
        // changing the autoCompact var.
        autoCompactMtx sync.Mutex
        autoCompact    bool

        // Cancel a running compaction when a shutdown is initiated.
        compactCancel context.CancelFunc

        // oooWasEnabled is true if out of order support was enabled at least one time
        // during the time TSDB was up. In which case we need to keep supporting
        // out-of-order compaction and vertical queries.
        oooWasEnabled atomic.Bool

        writeNotified wlog.WriteNotified

        // registerer is the Prometheus registerer the DB was opened with; it may be nil.
        registerer prometheus.Registerer

        // blockQuerierFunc is the function used to build a storage.Querier for a block.
        blockQuerierFunc BlockQuerierFunc

        // blockChunkQuerierFunc is the function used to build a storage.ChunkQuerier for a block.
        blockChunkQuerierFunc BlockChunkQuerierFunc
}

// dbMetrics groups the Prometheus collectors that describe a DB.
// All fields are populated by newDBMetrics.
type dbMetrics struct {
        loadedBlocks         prometheus.GaugeFunc // prometheus_tsdb_blocks_loaded
        symbolTableSize      prometheus.GaugeFunc // prometheus_tsdb_symbol_table_size_bytes
        reloads              prometheus.Counter   // prometheus_tsdb_reloads_total
        reloadsFailed        prometheus.Counter   // prometheus_tsdb_reloads_failures_total
        compactionsFailed    prometheus.Counter   // prometheus_tsdb_compactions_failed_total
        compactionsTriggered prometheus.Counter   // prometheus_tsdb_compactions_triggered_total
        compactionsSkipped   prometheus.Counter   // prometheus_tsdb_compactions_skipped_total
        sizeRetentionCount   prometheus.Counter   // prometheus_tsdb_size_retentions_total
        timeRetentionCount   prometheus.Counter   // prometheus_tsdb_time_retentions_total
        startTime            prometheus.GaugeFunc // prometheus_tsdb_lowest_timestamp
        tombCleanTimer       prometheus.Histogram // prometheus_tsdb_tombstone_cleanup_seconds
        blocksBytes          prometheus.Gauge     // prometheus_tsdb_storage_blocks_bytes
        maxBytes             prometheus.Gauge     // prometheus_tsdb_retention_limit_bytes
        retentionDuration    prometheus.Gauge     // prometheus_tsdb_retention_limit_seconds
}

// newDBMetrics creates the collectors describing db and, when r is non-nil,
// registers all of them with r. The function-backed gauges read db state
// under db.mtx each time they are evaluated.
func newDBMetrics(db *DB, r prometheus.Registerer) *dbMetrics {
        // Collectors without callbacks are built directly in the literal.
        m := &dbMetrics{
                reloads: prometheus.NewCounter(prometheus.CounterOpts{
                        Name: "prometheus_tsdb_reloads_total",
                        Help: "Number of times the database reloaded block data from disk.",
                }),
                reloadsFailed: prometheus.NewCounter(prometheus.CounterOpts{
                        Name: "prometheus_tsdb_reloads_failures_total",
                        Help: "Number of times the database failed to reloadBlocks block data from disk.",
                }),
                compactionsTriggered: prometheus.NewCounter(prometheus.CounterOpts{
                        Name: "prometheus_tsdb_compactions_triggered_total",
                        Help: "Total number of triggered compactions for the partition.",
                }),
                compactionsFailed: prometheus.NewCounter(prometheus.CounterOpts{
                        Name: "prometheus_tsdb_compactions_failed_total",
                        Help: "Total number of compactions that failed for the partition.",
                }),
                compactionsSkipped: prometheus.NewCounter(prometheus.CounterOpts{
                        Name: "prometheus_tsdb_compactions_skipped_total",
                        Help: "Total number of skipped compactions due to disabled auto compaction.",
                }),
                timeRetentionCount: prometheus.NewCounter(prometheus.CounterOpts{
                        Name: "prometheus_tsdb_time_retentions_total",
                        Help: "The number of times that blocks were deleted because the maximum time limit was exceeded.",
                }),
                sizeRetentionCount: prometheus.NewCounter(prometheus.CounterOpts{
                        Name: "prometheus_tsdb_size_retentions_total",
                        Help: "The number of times that blocks were deleted because the maximum number of bytes was exceeded.",
                }),
                tombCleanTimer: prometheus.NewHistogram(prometheus.HistogramOpts{
                        Name:                            "prometheus_tsdb_tombstone_cleanup_seconds",
                        Help:                            "The time taken to recompact blocks to remove tombstones.",
                        NativeHistogramBucketFactor:     1.1,
                        NativeHistogramMaxBucketNumber:  100,
                        NativeHistogramMinResetDuration: 1 * time.Hour,
                }),
                blocksBytes: prometheus.NewGauge(prometheus.GaugeOpts{
                        Name: "prometheus_tsdb_storage_blocks_bytes",
                        Help: "The number of bytes that are currently used for local storage by all blocks.",
                }),
                maxBytes: prometheus.NewGauge(prometheus.GaugeOpts{
                        Name: "prometheus_tsdb_retention_limit_bytes",
                        Help: "Max number of bytes to be retained in the tsdb blocks, configured 0 means disabled",
                }),
                retentionDuration: prometheus.NewGauge(prometheus.GaugeOpts{
                        Name: "prometheus_tsdb_retention_limit_seconds",
                        Help: "How long to retain samples in storage.",
                }),
        }

        // Number of blocks currently held in db.blocks.
        m.loadedBlocks = prometheus.NewGaugeFunc(prometheus.GaugeOpts{
                Name: "prometheus_tsdb_blocks_loaded",
                Help: "Number of currently loaded data blocks",
        }, func() float64 {
                db.mtx.RLock()
                defer db.mtx.RUnlock()
                return float64(len(db.blocks))
        })

        // Sum of the symbol table sizes; the lock is only held while the
        // slice header is copied, not while the blocks are inspected.
        m.symbolTableSize = prometheus.NewGaugeFunc(prometheus.GaugeOpts{
                Name: "prometheus_tsdb_symbol_table_size_bytes",
                Help: "Size of symbol table in memory for loaded blocks",
        }, func() float64 {
                db.mtx.RLock()
                loaded := db.blocks
                db.mtx.RUnlock()

                var total uint64
                for _, blk := range loaded {
                        total += blk.GetSymbolTableSize()
                }
                return float64(total)
        })

        // MinTime of the first block, or of the head when no block is loaded.
        m.startTime = prometheus.NewGaugeFunc(prometheus.GaugeOpts{
                Name: "prometheus_tsdb_lowest_timestamp",
                Help: "Lowest timestamp value stored in the database. The unit is decided by the library consumer.",
        }, func() float64 {
                db.mtx.RLock()
                defer db.mtx.RUnlock()
                if len(db.blocks) > 0 {
                        return float64(db.blocks[0].meta.MinTime)
                }
                return float64(db.head.MinTime())
        })

        if r == nil {
                return m
        }
        r.MustRegister(
                m.loadedBlocks,
                m.symbolTableSize,
                m.reloads,
                m.reloadsFailed,
                m.compactionsFailed,
                m.compactionsTriggered,
                m.compactionsSkipped,
                m.sizeRetentionCount,
                m.timeRetentionCount,
                m.startTime,
                m.tombCleanTimer,
                m.blocksBytes,
                m.maxBytes,
                m.retentionDuration,
        )
        return m
}

// DBStats contains statistics about the DB separated by component (eg. head).
// They are available before the DB has finished initializing.
type DBStats struct {
        // Head holds the statistics of the head component.
        Head *HeadStats
}

// NewDBStats allocates a DBStats whose per-component statistics are created
// with each component's own constructor.
func NewDBStats() *DBStats {
        stats := new(DBStats)
        stats.Head = NewHeadStats()
        return stats
}

// ErrClosed is returned when the db is closed, e.g. by DBReadOnly methods
// that are called after Close.
var ErrClosed = errors.New("db already closed")

// DBReadOnly provides APIs for read only operations on a database.
// Current implementation doesn't support concurrency so
// all API calls should happen in the same go routine.
type DBReadOnly struct {
        logger log.Logger
        // dir is the database directory that is read.
        dir string
        // sandboxDir is the temporary directory created by OpenDBReadOnly;
        // it is removed again by Close.
        sandboxDir string
        // closers collects the readers opened so far; they are closed by Close.
        closers []io.Closer
        // closed is closed by Close to mark the DB as no longer usable.
        closed chan struct{}
}

// OpenDBReadOnly opens the DB found in dir for read only operations.
//
// dir must already exist. A temporary sandbox directory is created below
// sandboxDirRoot, or below dir itself when sandboxDirRoot is empty. A nil
// logger is replaced by a no-op logger.
func OpenDBReadOnly(dir, sandboxDirRoot string, l log.Logger) (*DBReadOnly, error) {
        if _, statErr := os.Stat(dir); statErr != nil {
                return nil, fmt.Errorf("opening the db dir: %w", statErr)
        }

        // Fall back to the DB dir as parent of the sandbox.
        root := sandboxDirRoot
        if root == "" {
                root = dir
        }
        sandbox, err := os.MkdirTemp(root, "tmp_dbro_sandbox")
        if err != nil {
                return nil, fmt.Errorf("setting up sandbox dir: %w", err)
        }

        logger := l
        if logger == nil {
                logger = log.NewNopLogger()
        }

        ro := &DBReadOnly{
                logger:     logger,
                dir:        dir,
                sandboxDir: sandbox,
                closed:     make(chan struct{}),
        }
        return ro, nil
}

// FlushWAL creates a new block containing all data that's currently in the memory buffer/WAL.
// Samples that are in existing blocks will not be written to the new block.
// Note that if the read only database is running concurrently with a
// writable database then writing the WAL to the database directory can race.
//
// The block is written into dir. The WAL (and the WBL, if its directory
// exists) is opened from the database directory and replayed into a temporary
// Head, which is closed again before returning. Errors from closing are
// merged into the returned error.
func (db *DBReadOnly) FlushWAL(dir string) (returnErr error) {
        blockReaders, err := db.Blocks()
        if err != nil {
                return fmt.Errorf("read blocks: %w", err)
        }
        maxBlockTime := int64(math.MinInt64)
        if len(blockReaders) > 0 {
                maxBlockTime = blockReaders[len(blockReaders)-1].Meta().MaxTime
        }
        w, err := wlog.Open(db.logger, filepath.Join(db.dir, "wal"))
        if err != nil {
                return err
        }
        var wbl *wlog.WL
        // Until a Head has been created nothing else closes the opened logs, so
        // release them here if we bail out early. Once the Head exists, closing
        // is left to head.Close() in the deferred function further down.
        headCreated := false
        defer func() {
                if headCreated {
                        return
                }
                errs := tsdb_errors.NewMulti(returnErr)
                if err := w.Close(); err != nil {
                        errs.Add(fmt.Errorf("closing WAL: %w", err))
                }
                if wbl != nil {
                        if err := wbl.Close(); err != nil {
                                errs.Add(fmt.Errorf("closing WBL: %w", err))
                        }
                }
                returnErr = errs.Err()
        }()

        wblDir := filepath.Join(db.dir, wlog.WblDirName)
        if _, err := os.Stat(wblDir); !os.IsNotExist(err) {
                wbl, err = wlog.Open(db.logger, wblDir)
                if err != nil {
                        return err
                }
        }
        opts := DefaultHeadOptions()
        opts.ChunkDirRoot = db.dir
        head, err := NewHead(nil, db.logger, w, wbl, opts, NewHeadStats())
        if err != nil {
                return err
        }
        headCreated = true
        defer func() {
                errs := tsdb_errors.NewMulti(returnErr)
                if err := head.Close(); err != nil {
                        errs.Add(fmt.Errorf("closing Head: %w", err))
                }
                returnErr = errs.Err()
        }()
        // Set the min valid time for the ingested wal samples
        // to be no lower than the maxt of the last block.
        if err := head.Init(maxBlockTime); err != nil {
                return fmt.Errorf("read WAL: %w", err)
        }
        mint := head.MinTime()
        maxt := head.MaxTime()
        rh := NewRangeHead(head, mint, maxt)
        compactor, err := NewLeveledCompactor(
                context.Background(),
                nil,
                db.logger,
                ExponentialBlockRanges(DefaultOptions().MinBlockDuration, 3, 5),
                chunkenc.NewPool(), nil,
        )
        if err != nil {
                return fmt.Errorf("create leveled compactor: %w", err)
        }
        // Add +1 millisecond to block maxt because block intervals are half-open: [b.MinTime, b.MaxTime).
        // Because of this block intervals are always +1 than the total samples it includes.
        _, err = compactor.Write(dir, rh, mint, maxt+1, nil)
        if err != nil {
                return fmt.Errorf("writing WAL: %w", err)
        }
        return nil
}

// loadDataAsQueryable loads the persisted blocks plus a Head and returns them
// wrapped in a DB value that serves as the queryable.
//
// The WAL (and the WBL, if its directory exists) is only replayed into the
// Head when the MaxTime of the last block is <= maxt; otherwise an empty Head
// without WAL is used. The Head is appended to db.closers.
// It returns ErrClosed if the DB has already been closed.
func (db *DBReadOnly) loadDataAsQueryable(maxt int64) (storage.SampleAndChunkQueryable, error) {
        select {
        case <-db.closed:
                return nil, ErrClosed
        default:
        }
        blockReaders, err := db.Blocks()
        if err != nil {
                return nil, err
        }
        // The DB struct needs concrete *Block values rather than BlockReaders.
        blocks := make([]*Block, len(blockReaders))
        for i, b := range blockReaders {
                b, ok := b.(*Block)
                if !ok {
                        return nil, errors.New("unable to convert a read only block to a normal block")
                }
                blocks[i] = b
        }

        opts := DefaultHeadOptions()
        // Hard link the chunk files to a dir in db.sandboxDir in case the Head needs to truncate some of them
        // or cut new ones while replaying the WAL.
        // See https://github.com/prometheus/prometheus/issues/11618.
        err = chunks.HardLinkChunkFiles(mmappedChunksDir(db.dir), mmappedChunksDir(db.sandboxDir))
        if err != nil {
                return nil, err
        }
        opts.ChunkDirRoot = db.sandboxDir
        // Start with a Head that has neither WAL nor WBL attached.
        head, err := NewHead(nil, db.logger, nil, nil, opts, NewHeadStats())
        if err != nil {
                return nil, err
        }
        // With no blocks at all, math.MinInt64 makes the comparison below always true.
        maxBlockTime := int64(math.MinInt64)
        if len(blocks) > 0 {
                maxBlockTime = blocks[len(blocks)-1].Meta().MaxTime
        }

        // Also add the WAL if the current blocks don't cover the requests time range.
        if maxBlockTime <= maxt {
                // Discard the WAL-less Head; it is replaced by one backed by the WAL below.
                if err := head.Close(); err != nil {
                        return nil, err
                }
                w, err := wlog.Open(db.logger, filepath.Join(db.dir, "wal"))
                if err != nil {
                        return nil, err
                }
                var wbl *wlog.WL
                wblDir := filepath.Join(db.dir, wlog.WblDirName)
                // Only open the WBL when its directory is not reported as missing.
                if _, err := os.Stat(wblDir); !os.IsNotExist(err) {
                        wbl, err = wlog.Open(db.logger, wblDir)
                        if err != nil {
                                return nil, err
                        }
                }
                opts := DefaultHeadOptions()
                opts.ChunkDirRoot = db.sandboxDir
                head, err = NewHead(nil, db.logger, w, wbl, opts, NewHeadStats())
                if err != nil {
                        return nil, err
                }
                // Set the min valid time for the ingested wal samples
                // to be no lower than the maxt of the last block.
                if err := head.Init(maxBlockTime); err != nil {
                        return nil, fmt.Errorf("read WAL: %w", err)
                }
                // Set the wal and the wbl to nil to disable related operations.
                // This is mainly to avoid blocking when closing the head.
                head.wal = nil
                head.wbl = nil
        }

        // Register the Head so that it gets closed together with the block readers.
        db.closers = append(db.closers, head)
        return &DB{
                dir:                   db.dir,
                logger:                db.logger,
                blocks:                blocks,
                head:                  head,
                blockQuerierFunc:      NewBlockQuerier,
                blockChunkQuerierFunc: NewBlockChunkQuerier,
        }, nil
}

// Querier loads the blocks and, if needed to cover maxt, the WAL, and returns
// a new querier over the data partition for the time range [mint, maxt].
// Current implementation doesn't support multiple Queriers.
func (db *DBReadOnly) Querier(mint, maxt int64) (storage.Querier, error) {
        queryable, loadErr := db.loadDataAsQueryable(maxt)
        if loadErr != nil {
                return nil, loadErr
        }
        return queryable.Querier(mint, maxt)
}

// ChunkQuerier loads the blocks and, if needed to cover maxt, the WAL, and
// returns a new chunk querier over the data partition for the time range
// [mint, maxt].
// Current implementation doesn't support multiple ChunkQueriers.
func (db *DBReadOnly) ChunkQuerier(mint, maxt int64) (storage.ChunkQuerier, error) {
        queryable, loadErr := db.loadDataAsQueryable(maxt)
        if loadErr != nil {
                return nil, loadErr
        }
        return queryable.ChunkQuerier(mint, maxt)
}

// Blocks opens the persisted blocks in the database directory and returns
// them as block readers sorted by MinTime.
//
// It fails if any corrupted block is left that has not been superseded by a
// loadable block. On success, every closer previously tracked by db is closed
// and replaced by the newly opened blocks. It returns ErrClosed once the DB
// has been closed, and (nil, nil) if there are no blocks.
func (db *DBReadOnly) Blocks() ([]BlockReader, error) {
        select {
        case <-db.closed:
                return nil, ErrClosed
        default:
        }

        loadable, corrupted, err := openBlocks(db.logger, db.dir, nil, nil)
        if err != nil {
                return nil, err
        }

        // A corrupted block whose data lives on in a loadable block (i.e. it is
        // one of that block's compaction parents) does no harm; forget about it.
        for _, blk := range loadable {
                for _, parent := range blk.Meta().Compaction.Parents {
                        delete(corrupted, parent.ULID)
                }
        }

        if len(corrupted) > 0 {
                // Give up: release what was opened and report every corruption.
                for _, blk := range loadable {
                        if closeErr := blk.Close(); closeErr != nil {
                                level.Warn(db.logger).Log("msg", "Closing block failed", "err", closeErr, "block", blk)
                        }
                }
                errs := tsdb_errors.NewMulti()
                for id, blockErr := range corrupted {
                        if blockErr == nil {
                                continue
                        }
                        errs.Add(fmt.Errorf("corrupted block %s: %w", id.String(), blockErr))
                }
                return nil, errs.Err()
        }

        if len(loadable) == 0 {
                return nil, nil
        }

        // Order the blocks by ascending MinTime.
        slices.SortFunc(loadable, func(a, b *Block) int {
                aMin, bMin := a.Meta().MinTime, b.Meta().MinTime
                if aMin < bMin {
                        return -1
                }
                if aMin > bMin {
                        return 1
                }
                return 0
        })

        // Overlaps are only reported, they do not prevent opening.
        metas := make([]BlockMeta, len(loadable))
        for i, blk := range loadable {
                metas[i] = blk.Meta()
        }
        if overlaps := OverlappingBlocks(metas); len(overlaps) > 0 {
                level.Warn(db.logger).Log("msg", "Overlapping blocks found during opening", "detail", overlaps.String())
        }

        // Close all previously open readers and add the new ones to the cache.
        for _, previous := range db.closers {
                previous.Close()
        }

        readers := make([]BlockReader, len(loadable))
        closers := make([]io.Closer, len(loadable))
        for i, blk := range loadable {
                readers[i] = blk
                closers[i] = blk
        }
        db.closers = closers

        return readers, nil
}

// LastBlockID returns the BlockID of the latest block, i.e. the name of the
// entry in the database directory that parses as a ULID and carries the
// highest ULID timestamp. If several entries share that timestamp, the first
// one returned by os.ReadDir wins.
// It returns an error if the directory cannot be read or contains no block.
func (db *DBReadOnly) LastBlockID() (string, error) {
        entries, err := os.ReadDir(db.dir)
        if err != nil {
                return "", err
        }

        var (
                latest      uint64 // Highest ULID timestamp seen so far.
                lastBlockID string
        )

        for _, e := range entries {
                // Check if dir is a block dir or not.
                dirName := e.Name()
                ulidObj, err := ulid.ParseStrict(dirName)
                if err != nil {
                        continue // Not a block dir.
                }
                // Always accept the first block found, so that a block whose ULID
                // timestamp is 0 is not mistaken for "no blocks".
                if timestamp := ulidObj.Time(); lastBlockID == "" || timestamp > latest {
                        latest = timestamp
                        lastBlockID = dirName
                }
        }

        if lastBlockID == "" {
                return "", errors.New("no blocks found")
        }

        return lastBlockID, nil
}

// Block opens the block with the given ID from the database directory and
// returns a reader for it. The opened block is tracked so that it is closed
// together with the DB. It returns ErrClosed once the DB has been closed.
func (db *DBReadOnly) Block(blockID string) (BlockReader, error) {
        select {
        case <-db.closed:
                return nil, ErrClosed
        default:
        }

        blockDir := filepath.Join(db.dir, blockID)
        // Only a missing directory is reported as an invalid ID; any other stat
        // problem is left for OpenBlock to surface.
        if _, statErr := os.Stat(blockDir); os.IsNotExist(statErr) {
                return nil, fmt.Errorf("invalid block ID %s", blockID)
        }

        blk, err := OpenBlock(db.logger, blockDir, nil)
        if err != nil {
                return nil, err
        }
        db.closers = append(db.closers, blk)

        return blk, nil
}

// Close closes all tracked readers and deletes the sandbox dir.
// Calling it on an already closed DB returns ErrClosed; the sandbox dir
// removal is attempted in either case.
func (db *DBReadOnly) Close() error {
        // Delete the temporary sandbox directory that was created when opening
        // the DB. A failure here is only logged.
        defer func() {
                if rmErr := os.RemoveAll(db.sandboxDir); rmErr != nil {
                        level.Error(db.logger).Log("msg", "delete sandbox dir", "err", rmErr)
                }
        }()

        select {
        case <-db.closed:
                return ErrClosed
        default:
                close(db.closed)
                return tsdb_errors.CloseAll(db.closers)
        }
}

// Open returns a new DB in the given directory. A nil opts is replaced by
// DefaultOptions, and invalid option values are replaced by their defaults
// before the DB is opened.
func Open(dir string, l log.Logger, r prometheus.Registerer, opts *Options, stats *DBStats) (db *DB, err error) {
        validated, ranges := validateOpts(opts, nil)
        return open(dir, l, r, validated, ranges, stats)
}

// validateOpts sanitizes opts in place and returns it together with the block
// ranges to use. A nil opts is replaced by DefaultOptions(), out-of-range
// values are replaced by their defaults, and empty rngs are replaced by
// exponential ranges derived from MinBlockDuration.
func validateOpts(opts *Options, rngs []int64) (*Options, []int64) {
        if opts == nil {
                opts = DefaultOptions()
        }

        // Sizes that must be strictly positive.
        if opts.StripeSize <= 0 {
                opts.StripeSize = DefaultStripeSize
        }
        if opts.HeadChunksWriteBufferSize <= 0 {
                opts.HeadChunksWriteBufferSize = chunks.DefaultWriteBufferSize
        }
        // Unlike the other sizes, a queue size of 0 is kept as is.
        if opts.HeadChunksWriteQueueSize < 0 {
                opts.HeadChunksWriteQueueSize = chunks.DefaultWriteQueueSize
        }
        if opts.SamplesPerChunk <= 0 {
                opts.SamplesPerChunk = DefaultSamplesPerChunk
        }
        if opts.MaxBlockChunkSegmentSize <= 0 {
                opts.MaxBlockChunkSegmentSize = chunks.DefaultChunkSegmentSize
        }

        // Block durations: default the minimum first, then make sure the
        // maximum is never below it.
        if opts.MinBlockDuration <= 0 {
                opts.MinBlockDuration = DefaultBlockDuration
        }
        if opts.MaxBlockDuration < opts.MinBlockDuration {
                opts.MaxBlockDuration = opts.MinBlockDuration
        }

        // Out-of-order settings.
        if opts.OutOfOrderCapMax <= 0 {
                opts.OutOfOrderCapMax = DefaultOutOfOrderCapMax
        }
        if opts.OutOfOrderTimeWindow < 0 {
                opts.OutOfOrderTimeWindow = 0
        }

        if len(rngs) > 0 {
                return opts, rngs
        }
        // Start with the smallest block duration and create exponential buckets;
        // the caller trims them to the configured maximum block duration.
        return opts, ExponentialBlockRanges(opts.MinBlockDuration, 10, 3)
}

// open returns a new DB in the given directory.
// It initializes the lockfile, WAL, compactor, and Head (by replaying the WAL), and runs the database.
// It is not safe to open more than one DB in the same directory.
//
// A nil logger is replaced by a no-op logger and nil stats by NewDBStats().
// rngs is cut at the first range that exceeds opts.MaxBlockDuration, and the
// first remaining range is used as the head chunk range.
// If any step fails, the partially constructed DB is closed and the close error
// (if any) is combined with the original error.
func open(dir string, l log.Logger, r prometheus.Registerer, opts *Options, rngs []int64, stats *DBStats) (_ *DB, returnedErr error) {
        if err := os.MkdirAll(dir, 0o777); err != nil {
                return nil, err
        }
        if l == nil {
                l = log.NewNopLogger()
        }
        if stats == nil {
                stats = NewDBStats()
        }

        // Drop the first range that exceeds the maximum block duration and everything after it.
        for i, v := range rngs {
                if v > opts.MaxBlockDuration {
                        rngs = rngs[:i]
                        break
                }
        }

        // Fixup bad format written by Prometheus 2.1.
        if err := repairBadIndexVersion(l, dir); err != nil {
                return nil, fmt.Errorf("repair bad index version: %w", err)
        }

        walDir := filepath.Join(dir, "wal")
        wblDir := filepath.Join(dir, wlog.WblDirName)

        for _, tmpDir := range []string{walDir, dir} {
                // Remove tmp dirs.
                if err := removeBestEffortTmpDirs(l, tmpDir); err != nil {
                        return nil, fmt.Errorf("remove tmp dirs: %w", err)
                }
        }

        db := &DB{
                dir:            dir,
                logger:         l,
                opts:           opts,
                compactc:       make(chan struct{}, 1),
                donec:          make(chan struct{}),
                stopc:          make(chan struct{}),
                autoCompact:    true,
                chunkPool:      chunkenc.NewPool(),
                blocksToDelete: opts.BlocksToDelete,
                registerer:     r,
        }
        // From here on, every error return goes through this cleanup.
        defer func() {
                // Close files if startup fails somewhere.
                if returnedErr == nil {
                        return
                }

                close(db.donec) // DB is never run if it was an error, so close this channel here.
                errs := tsdb_errors.NewMulti(returnedErr)
                if err := db.Close(); err != nil {
                        errs.Add(fmt.Errorf("close DB after failed startup: %w", err))
                }
                returnedErr = errs.Err()
        }()

        // Fall back to the default retention-based filter when none was configured.
        if db.blocksToDelete == nil {
                db.blocksToDelete = DefaultBlocksToDelete(db)
        }

        var err error
        db.locker, err = tsdbutil.NewDirLocker(dir, "tsdb", db.logger, r)
        if err != nil {
                return nil, err
        }
        // The locker is always created, but the lock is only taken when lockfiles are enabled.
        if !opts.NoLockfile {
                if err := db.locker.Lock(); err != nil {
                        return nil, err
                }
        }

        // ctx is handed to the compactor and to the run loop; cancel is kept on
        // the DB as compactCancel once the compactor has been created.
        ctx, cancel := context.WithCancel(context.Background())
        if opts.NewCompactorFunc != nil {
                db.compactor, err = opts.NewCompactorFunc(ctx, r, l, rngs, db.chunkPool, opts)
        } else {
                db.compactor, err = NewLeveledCompactorWithOptions(ctx, r, l, rngs, db.chunkPool, LeveledCompactorOptions{
                        MaxBlockChunkSegmentSize:    opts.MaxBlockChunkSegmentSize,
                        EnableOverlappingCompaction: opts.EnableOverlappingCompaction,
                })
        }
        if err != nil {
                // compactCancel has not been assigned yet, so cancel explicitly here.
                cancel()
                return nil, fmt.Errorf("create compactor: %w", err)
        }
        db.compactCancel = cancel

        // Use the default querier constructors unless overridden through the options.
        if opts.BlockQuerierFunc == nil {
                db.blockQuerierFunc = NewBlockQuerier
        } else {
                db.blockQuerierFunc = opts.BlockQuerierFunc
        }

        if opts.BlockChunkQuerierFunc == nil {
                db.blockChunkQuerierFunc = NewBlockChunkQuerier
        } else {
                db.blockChunkQuerierFunc = opts.BlockChunkQuerierFunc
        }

        // wal and wbl stay nil when the WAL is disabled (negative WALSegmentSize).
        var wal, wbl *wlog.WL
        segmentSize := wlog.DefaultSegmentSize
        // Wal is enabled.
        if opts.WALSegmentSize >= 0 {
                // Wal is set to a custom size.
                if opts.WALSegmentSize > 0 {
                        segmentSize = opts.WALSegmentSize
                }
                wal, err = wlog.NewSize(l, r, walDir, segmentSize, opts.WALCompression)
                if err != nil {
                        return nil, err
                }
                // Check if there is a WBL on disk, in which case we should replay that data.
                wblSize, err := fileutil.DirSize(wblDir)
                if err != nil && !os.IsNotExist(err) {
                        return nil, err
                }
                // Open the WBL when OOO ingestion is enabled or when leftover WBL data exists on disk.
                if opts.OutOfOrderTimeWindow > 0 || wblSize > 0 {
                        wbl, err = wlog.NewSize(l, r, wblDir, segmentSize, opts.WALCompression)
                        if err != nil {
                                return nil, err
                        }
                }
        }
        db.oooWasEnabled.Store(opts.OutOfOrderTimeWindow > 0)
        // Translate the DB options into head options.
        headOpts := DefaultHeadOptions()
        // NOTE(review): rngs[0] is indexed unconditionally; this panics if rngs is empty
        // (e.g. if every supplied range exceeded MaxBlockDuration) — confirm callers guarantee otherwise.
        headOpts.ChunkRange = rngs[0]
        headOpts.ChunkDirRoot = dir
        headOpts.ChunkPool = db.chunkPool
        headOpts.ChunkWriteBufferSize = opts.HeadChunksWriteBufferSize
        headOpts.ChunkWriteQueueSize = opts.HeadChunksWriteQueueSize
        headOpts.SamplesPerChunk = opts.SamplesPerChunk
        headOpts.StripeSize = opts.StripeSize
        headOpts.SeriesCallback = opts.SeriesLifecycleCallback
        headOpts.EnableExemplarStorage = opts.EnableExemplarStorage
        headOpts.MaxExemplars.Store(opts.MaxExemplars)
        headOpts.EnableMemorySnapshotOnShutdown = opts.EnableMemorySnapshotOnShutdown
        headOpts.EnableNativeHistograms.Store(opts.EnableNativeHistograms)
        headOpts.OutOfOrderTimeWindow.Store(opts.OutOfOrderTimeWindow)
        headOpts.OutOfOrderCapMax.Store(opts.OutOfOrderCapMax)
        headOpts.EnableSharding = opts.EnableSharding
        // Keep the head default unless a positive concurrency was configured.
        if opts.WALReplayConcurrency > 0 {
                headOpts.WALReplayConcurrency = opts.WALReplayConcurrency
        }
        if opts.IsolationDisabled {
                // We only override this flag if isolation is disabled at DB level. We use the default otherwise.
                headOpts.IsolationDisabled = opts.IsolationDisabled
        }
        db.head, err = NewHead(r, l, wal, wbl, headOpts, stats.Head)
        if err != nil {
                return nil, err
        }
        db.head.writeNotified = db.writeNotified

        // Register metrics after assigning the head block.
        db.metrics = newDBMetrics(db, r)
        // A negative MaxBytes is reported as 0 in the metric.
        maxBytes := opts.MaxBytes
        if maxBytes < 0 {
                maxBytes = 0
        }
        db.metrics.maxBytes.Set(float64(maxBytes))
        // RetentionDuration is interpreted as milliseconds and exported in seconds.
        db.metrics.retentionDuration.Set((time.Duration(opts.RetentionDuration) * time.Millisecond).Seconds())

        if err := db.reload(); err != nil {
                return nil, err
        }
        // Set the min valid time for the ingested samples
        // to be no lower than the maxt of the last block.
        minValidTime := int64(math.MinInt64)
        // We do not consider blocks created from out-of-order samples for Head's minValidTime
        // since minValidTime is only for the in-order data and we do not want to discard unnecessary
        // samples from the Head.
        inOrderMaxTime, ok := db.inOrderBlocksMaxTime()
        if ok {
                minValidTime = inOrderMaxTime
        }

        // A failed head initialization is counted as a WAL corruption and repaired in place:
        // an errLoadWbl triggers a WBL repair, any other error triggers a WAL repair.
        // Startup only fails if the repair itself fails.
        if initErr := db.head.Init(minValidTime); initErr != nil {
                db.head.metrics.walCorruptionsTotal.Inc()
                var e *errLoadWbl
                if errors.As(initErr, &e) {
                        level.Warn(db.logger).Log("msg", "Encountered WBL read error, attempting repair", "err", initErr)
                        if err := wbl.Repair(e.err); err != nil {
                                return nil, fmt.Errorf("repair corrupted WBL: %w", err)
                        }
                        level.Info(db.logger).Log("msg", "Successfully repaired WBL")
                } else {
                        level.Warn(db.logger).Log("msg", "Encountered WAL read error, attempting repair", "err", initErr)
                        if err := wal.Repair(initErr); err != nil {
                                return nil, fmt.Errorf("repair corrupted WAL: %w", err)
                        }
                        level.Info(db.logger).Log("msg", "Successfully repaired WAL")
                }
        }

        if db.head.MinOOOTime() != int64(math.MaxInt64) {
                // Some OOO data was replayed from the disk that needs compaction and cleanup.
                db.oooWasEnabled.Store(true)
        }

        // Start the background loop; it closes db.donec when it exits.
        go db.run(ctx)

        return db, nil
}

// removeBestEffortTmpDirs deletes every entry of dir that isTmpDir reports as
// a temporary directory. A non-existent dir is not an error. Failing to delete
// an individual entry is only logged; the only error returned is a failure to
// list dir.
func removeBestEffortTmpDirs(l log.Logger, dir string) error {
        entries, err := os.ReadDir(dir)
        switch {
        case os.IsNotExist(err):
                return nil
        case err != nil:
                return err
        }

        for _, entry := range entries {
                if !isTmpDir(entry) {
                        continue
                }
                tmpPath := filepath.Join(dir, entry.Name())
                if rmErr := os.RemoveAll(tmpPath); rmErr != nil {
                        // Best effort: report the failure and carry on with the next entry.
                        level.Error(l).Log("msg", "failed to delete tmp block dir", "dir", tmpPath, "err", rmErr)
                        continue
                }
                level.Info(l).Log("msg", "Found and deleted tmp block dir", "dir", tmpPath)
        }
        return nil
}

// StartTime implements the Storage interface.
// It reports the MinTime of the first loaded block or, when no block is
// loaded, the MinTime of the head. The returned error is always nil.
func (db *DB) StartTime() (int64, error) {
        db.mtx.RLock()
        defer db.mtx.RUnlock()

        if len(db.blocks) == 0 {
                return db.head.MinTime(), nil
        }
        // reloadBlocks keeps db.blocks sorted by MinTime, so the first entry is the earliest.
        return db.blocks[0].Meta().MinTime, nil
}

// Dir returns the directory of the database.
// It reads the dir field directly, without taking any lock.
func (db *DB) Dir() string {
        return db.dir
}

// run is the background loop of the DB. It returns when db.stopc is closed and
// closes db.donec on exit.
//
// Each iteration first waits for the current backoff (zero unless the previous
// compaction failed) and then handles exactly one of the following events:
//   - one minute has elapsed: blocks are reloaded under cmtx, a compaction is
//     requested, and head chunks are m-mapped;
//   - a compaction was requested on db.compactc: Compact runs if auto
//     compaction is enabled, otherwise the request is counted as skipped;
//   - db.stopc was closed: the loop exits.
//
// ctx is passed on to Compact.
func (db *DB) run(ctx context.Context) {
        defer close(db.donec)

        backoff := time.Duration(0)

        for {
                // Wait out the backoff, but stay responsive to shutdown.
                select {
                case <-db.stopc:
                        return
                case <-time.After(backoff):
                }

                select {
                case <-time.After(1 * time.Minute):
                        db.cmtx.Lock()
                        if err := db.reloadBlocks(); err != nil {
                                level.Error(db.logger).Log("msg", "reloadBlocks", "err", err)
                        }
                        db.cmtx.Unlock()

                        // Non-blocking send: if a compaction request is already pending, do nothing.
                        select {
                        case db.compactc <- struct{}{}:
                        default:
                        }
                        // We attempt mmapping of head chunks regularly.
                        db.head.mmapHeadChunks()
                case <-db.compactc:
                        db.metrics.compactionsTriggered.Inc()

                        db.autoCompactMtx.Lock()
                        if db.autoCompact {
                                if err := db.Compact(ctx); err != nil {
                                        level.Error(db.logger).Log("msg", "compaction failed", "err", err)
                                        // Delay the next iteration; the backoff is reset after the next successful compaction.
                                        backoff = exponential(backoff, 1*time.Second, 1*time.Minute)
                                } else {
                                        backoff = 0
                                }
                        } else {
                                db.metrics.compactionsSkipped.Inc()
                        }
                        db.autoCompactMtx.Unlock()
                case <-db.stopc:
                        return
                }
        }
}

// Appender opens a new appender against the database.
// The returned value is a dbAppender wrapping the head's appender for ctx.
func (db *DB) Appender(ctx context.Context) storage.Appender {
        return dbAppender{db: db, Appender: db.head.Appender(ctx)}
}

// ApplyConfig applies a new config to the DB.
// Behaviour of 'OutOfOrderTimeWindow' is as follows:
// OOO enabled = oooTimeWindow > 0. OOO disabled = oooTimeWindow is 0.
// A negative configured window is treated as 0.
//
// 1) Before: OOO disabled, Now: OOO enabled =>
//   - A new WBL is created for the head block.
//   - OOO compaction is enabled.
//   - Overlapping queries are enabled.
//
// 2) Before: OOO enabled, Now: OOO enabled =>
//   - Only the time window is updated.
//
// 3) Before: OOO enabled, Now: OOO disabled =>
//   - Time Window set to 0. So no new OOO samples will be allowed.
//   - OOO WBL will stay and will be eventually cleaned up.
//   - OOO Compaction and overlapping queries will remain enabled until a restart or until all OOO samples are compacted.
//
// 4) Before: OOO disabled, Now: OOO disabled => no-op.
//
// An error is returned only when a new WBL has to be created and that fails.
func (db *DB) ApplyConfig(conf *config.Config) error {
        var window int64
        if tsdbCfg := conf.StorageConfig.TSDBConfig; tsdbCfg != nil {
                window = tsdbCfg.OutOfOrderTimeWindow
        }
        if window < 0 {
                window = 0
        }

        // Prefer the WBL the head already has: it might have been replayed from
        // disk while OOO was disabled. Otherwise create one, but only if OOO is
        // being switched on for the first time and the WAL is enabled.
        wblog := db.head.wbl
        if wblog == nil && !db.oooWasEnabled.Load() && window > 0 && db.opts.WALSegmentSize >= 0 {
                size := wlog.DefaultSegmentSize
                if db.opts.WALSegmentSize > 0 {
                        // Wal is set to a custom size.
                        size = db.opts.WALSegmentSize
                }
                wblDir := filepath.Join(db.dir, wlog.WblDirName)
                created, err := wlog.NewSize(db.logger, db.registerer, wblDir, size, db.opts.WALCompression)
                if err != nil {
                        return err
                }
                wblog = created
        }

        db.opts.OutOfOrderTimeWindow = window
        db.head.ApplyConfig(conf, wblog)

        // oooWasEnabled is sticky: once true it is never reset here.
        if !db.oooWasEnabled.Load() {
                db.oooWasEnabled.Store(window > 0)
        }
        return nil
}

// EnableNativeHistograms enables the native histogram feature.
// The call is forwarded to the head.
func (db *DB) EnableNativeHistograms() {
        db.head.EnableNativeHistograms()
}

// DisableNativeHistograms disables the native histogram feature.
// The call is forwarded to the head.
func (db *DB) DisableNativeHistograms() {
        db.head.DisableNativeHistograms()
}

// dbAppender wraps the DB's head appender and triggers compactions on commit
// if necessary.
type dbAppender struct {
        // Appender is the wrapped appender; its methods are promoted unless overridden.
        storage.Appender
        // db is the database whose compaction loop is notified on commit.
        db *DB
}

// Compile-time check that dbAppender implements storage.GetRef.
var _ storage.GetRef = dbAppender{}

// GetRef forwards to the wrapped appender when it implements storage.GetRef.
// Otherwise it returns a zero series reference and empty labels.
func (a dbAppender) GetRef(lset labels.Labels, hash uint64) (storage.SeriesRef, labels.Labels) {
        getter, ok := a.Appender.(storage.GetRef)
        if !ok {
                return 0, labels.EmptyLabels()
        }
        return getter.GetRef(lset, hash)
}

// Commit commits the wrapped appender and, whether or not that succeeded,
// requests a compaction when the head reports itself as compactable.
// The error of the wrapped Commit is returned unchanged.
func (a dbAppender) Commit() error {
        commitErr := a.Appender.Commit()

        // Checking every few minutes would be enough in practice, but for
        // benchmarks and high frequency use cases checking on every commit is safer.
        if !a.db.head.compactable() {
                return commitErr
        }

        // Non-blocking: a request that is already pending is sufficient.
        select {
        case a.db.compactc <- struct{}{}:
        default:
        }
        return commitErr
}

// Compact data if possible. After successful compaction blocks are reloaded
// which will also delete the blocks that fall out of the retention window.
// Old blocks are only deleted on reloadBlocks based on the new block's parent information.
// See DB.reloadBlocks documentation for further information.
//
// The compaction mutex (cmtx) is held for the whole call. The steps are:
// persist every compactable head range, truncate the WAL, compact the OOO head
// (only if at least one head range was persisted), and finally compact on-disk
// blocks. If db.stopc is closed while head ranges are being persisted, Compact
// returns nil early. ctx is only passed on to the OOO head compaction.
func (db *DB) Compact(ctx context.Context) (returnErr error) {
        db.cmtx.Lock()
        defer db.cmtx.Unlock()
        // Deferred functions run in reverse order, so this one observes the final
        // returnErr, including any error added by the WAL truncation defer below.
        defer func() {
                if returnErr != nil && !errors.Is(returnErr, context.Canceled) {
                        // If we got an error because context was canceled then we're most likely
                        // shutting down TSDB and we don't need to report this on metrics
                        db.metrics.compactionsFailed.Inc()
                }
        }()

        // lastBlockMaxt stays at MinInt64 until a head range has been persisted successfully.
        lastBlockMaxt := int64(math.MinInt64)
        // Always attempt a WAL truncation on the way out, also on error paths,
        // and merge its error into the returned one.
        defer func() {
                errs := tsdb_errors.NewMulti(returnErr)
                if err := db.head.truncateWAL(lastBlockMaxt); err != nil {
                        errs.Add(fmt.Errorf("WAL truncation in Compact defer: %w", err))
                }
                returnErr = errs.Err()
        }()

        start := time.Now()
        // Check whether we have pending head blocks that are ready to be persisted.
        // They have the highest priority.
        for {
                select {
                case <-db.stopc:
                        return nil
                default:
                }
                if !db.head.compactable() {
                        break
                }
                mint := db.head.MinTime()
                maxt := rangeForTimestamp(mint, db.head.chunkRange.Load())

                // Wrap head into a range that bounds all reads to it.
                // We remove 1 millisecond from maxt because block
                // intervals are half-open: [b.MinTime, b.MaxTime). But
                // chunk intervals are closed: [c.MinTime, c.MaxTime];
                // so in order to make sure that overlaps are evaluated
                // consistently, we explicitly remove the last value
                // from the block interval here.
                rh := NewRangeHeadWithIsolationDisabled(db.head, mint, maxt-1)

                // Compaction runs with isolation disabled, because head.compactable()
                // ensures that maxt is more than chunkRange/2 back from now, and
                // head.appendableMinValidTime() ensures that no new appends can start within the compaction range.
                // We do need to wait for any overlapping appenders that started previously to finish.
                db.head.WaitForAppendersOverlapping(rh.MaxTime())

                if err := db.compactHead(rh); err != nil {
                        return fmt.Errorf("compact head: %w", err)
                }
                // Consider only successful compactions for WAL truncation.
                lastBlockMaxt = maxt
        }

        // Clear some disk space before compacting blocks, especially important
        // when Head compaction happened over a long time range.
        if err := db.head.truncateWAL(lastBlockMaxt); err != nil {
                return fmt.Errorf("WAL truncation in Compact: %w", err)
        }

        // chunkRange is compared against milliseconds here.
        compactionDuration := time.Since(start)
        if compactionDuration.Milliseconds() > db.head.chunkRange.Load() {
                level.Warn(db.logger).Log(
                        "msg", "Head compaction took longer than the block time range, compactions are falling behind and won't be able to catch up",
                        "duration", compactionDuration.String(),
                        "block_range", db.head.chunkRange.Load(),
                )
        }

        if lastBlockMaxt != math.MinInt64 {
                // The head was compacted, so we compact OOO head as well.
                if err := db.compactOOOHead(ctx); err != nil {
                        return fmt.Errorf("compact ooo head: %w", err)
                }
        }

        return db.compactBlocks()
}

// CompactHead compacts the given RangeHead while holding the compaction mutex
// and afterwards truncates the WAL up to the head's block max time.
func (db *DB) CompactHead(head *RangeHead) error {
        db.cmtx.Lock()
        defer db.cmtx.Unlock()

        err := db.compactHead(head)
        if err != nil {
                return fmt.Errorf("compact head: %w", err)
        }

        err = db.head.truncateWAL(head.BlockMaxTime())
        if err != nil {
                return fmt.Errorf("WAL truncation: %w", err)
        }
        return nil
}

// CompactOOOHead compacts the OOO Head.
// It holds the compaction mutex (cmtx) for the duration of the call and
// delegates to compactOOOHead.
func (db *DB) CompactOOOHead(ctx context.Context) error {
        db.cmtx.Lock()
        defer db.cmtx.Unlock()

        return db.compactOOOHead(ctx)
}

// compactOOOHead writes the out-of-order head to blocks, reloads the blocks and
// then truncates the OOO data that was compacted.
// It is a no-op when OOO was never enabled (oooWasEnabled is false).
// If reloading the blocks fails, the newly written blocks are removed again.
// Both callers in this file hold the compaction mutex while calling it.
func (db *DB) compactOOOHead(ctx context.Context) error {
        if !db.oooWasEnabled.Load() {
                return nil
        }
        oooHead, err := NewOOOCompactionHead(ctx, db.head)
        if err != nil {
                return fmt.Errorf("get ooo compaction head: %w", err)
        }

        ulids, err := db.compactOOO(db.dir, oooHead)
        if err != nil {
                return fmt.Errorf("compact ooo head: %w", err)
        }
        if err := db.reloadBlocks(); err != nil {
                // Roll back: delete the blocks that were just written and report every failure.
                errs := tsdb_errors.NewMulti(err)
                for _, uid := range ulids {
                        if errRemoveAll := os.RemoveAll(filepath.Join(db.dir, uid.String())); errRemoveAll != nil {
                                errs.Add(errRemoveAll)
                        }
                }
                return fmt.Errorf("reloadBlocks blocks after failed compact ooo head: %w", errs.Err())
        }

        // Truncation is skipped only when both values are zero.
        lastWBLFile, minOOOMmapRef := oooHead.LastWBLFile(), oooHead.LastMmapRef()
        if lastWBLFile != 0 || minOOOMmapRef != 0 {
                if minOOOMmapRef != 0 {
                        // Ensure that no more queriers are created that will reference chunks we're about to garbage collect.
                        // truncateOOO waits for any existing queriers that reference chunks we're about to garbage collect to
                        // complete before running garbage collection, so we don't need to do that here.
                        //
                        // We take mtx to ensure that Querier() and ChunkQuerier() don't miss blocks: without this, they could
                        // capture the list of blocks before the call to reloadBlocks() above runs, but then capture
                        // lastGarbageCollectedMmapRef after we update it here, and therefore not query either the blocks we've just
                        // written or the head chunks those blocks were created from.
                        db.mtx.Lock()
                        db.lastGarbageCollectedMmapRef = minOOOMmapRef
                        db.mtx.Unlock()
                }

                if err := db.head.truncateOOO(lastWBLFile, minOOOMmapRef); err != nil {
                        return fmt.Errorf("truncate ooo wbl: %w", err)
                }
        }

        return nil
}

// compactOOO creates a new block per possible block range in dest from the OOO Head given.
// Each ULID in the result corresponds to a block in a unique time range.
//
// The OOO head's time span is walked in steps of its chunk range, aligned to a
// multiple of that range, and one compactor.Write call is issued per step.
// If any write fails, the blocks created so far are removed from dest on a
// best-effort basis and the error is returned. When no block was produced the
// result is nil.
func (db *DB) compactOOO(dest string, oooHead *OOOCompactionHead) (_ []ulid.ULID, err error) {
        start := time.Now()

        blockSize := oooHead.ChunkRange()
        oooHeadMint, oooHeadMaxt := oooHead.MinTime(), oooHead.MaxTime()
        var ulids []ulid.ULID
        defer func() {
                if err == nil {
                        return
                }
                // Best effort removal of created blocks on any error. The blocks were
                // written to dest, so that is where they have to be removed from.
                for _, uid := range ulids {
                        _ = os.RemoveAll(filepath.Join(dest, uid.String()))
                }
        }()

        // All blocks produced here are marked as out-of-order in their meta.
        meta := &BlockMeta{}
        meta.Compaction.SetOutOfOrder()
        for t := blockSize * (oooHeadMint / blockSize); t <= oooHeadMaxt; t += blockSize {
                mint, maxt := t, t+blockSize
                // Block intervals are half-open: [b.MinTime, b.MaxTime). Block intervals are always +1 than the total samples it includes.
                uids, writeErr := db.compactor.Write(dest, oooHead.CloneForTimeRange(mint, maxt-1), mint, maxt, meta)
                if writeErr != nil {
                        // Returning through the named result lets the deferred cleanup see the error.
                        return nil, writeErr
                }
                ulids = append(ulids, uids...)
        }

        if len(ulids) == 0 {
                level.Info(db.logger).Log(
                        "msg", "compact ooo head resulted in no blocks",
                        "duration", time.Since(start),
                )
                return nil, nil
        }

        level.Info(db.logger).Log(
                "msg", "out-of-order compaction completed",
                "duration", time.Since(start),
                "ulids", fmt.Sprintf("%v", ulids),
        )
        return ulids, nil
}

// compactHead persists the given RangeHead as a block, reloads the blocks and
// then truncates the head's memory up to the block's max time.
// If reloading fails, the freshly written block directories are removed again.
// The compaction mutex should be held before calling this method.
func (db *DB) compactHead(head *RangeHead) error {
        uids, err := db.compactor.Write(db.dir, head, head.MinTime(), head.BlockMaxTime(), nil)
        if err != nil {
                return fmt.Errorf("persist head block: %w", err)
        }

        reloadErr := db.reloadBlocks()
        if reloadErr == nil {
                if truncErr := db.head.truncateMemory(head.BlockMaxTime()); truncErr != nil {
                        return fmt.Errorf("head memory truncate: %w", truncErr)
                }
                return nil
        }

        // The reload failed: roll back by deleting what was just written and
        // report the reload error together with any deletion failures.
        multiErr := tsdb_errors.NewMulti(fmt.Errorf("reloadBlocks blocks: %w", reloadErr))
        for _, uid := range uids {
                blockDir := filepath.Join(db.dir, uid.String())
                if rmErr := os.RemoveAll(blockDir); rmErr != nil {
                        multiErr.Add(fmt.Errorf("delete persisted head block after failed db reloadBlocks:%s: %w", uid, rmErr))
                }
        }
        return multiErr.Err()
}

// compactBlocks compacts all the eligible on-disk blocks.
// It keeps asking the compactor for a plan and executing it until the plan is
// empty. It returns nil early when the head becomes compactable or when
// db.stopc is closed. If reloading the blocks after a compaction fails, the
// newly written blocks are removed again.
// The compaction mutex should be held before calling this method.
func (db *DB) compactBlocks() (err error) {
        // Check for compactions of multiple blocks.
        for {
                // If we have a lot of blocks to compact the whole process might take
                // long enough that we end up with a HEAD block that needs to be written.
                // Check if that's the case and stop compactions early.
                if db.head.compactable() {
                        level.Warn(db.logger).Log("msg", "aborting block compactions to persist the head block")
                        return nil
                }

                plan, planErr := db.compactor.Plan(db.dir)
                if planErr != nil {
                        return fmt.Errorf("plan compaction: %w", planErr)
                }
                if len(plan) == 0 {
                        return nil
                }

                // Do not start another compaction once shutdown has been requested.
                select {
                case <-db.stopc:
                        return nil
                default:
                }

                uids, compactErr := db.compactor.Compact(db.dir, plan, db.blocks)
                if compactErr != nil {
                        return fmt.Errorf("compact %s: %w", plan, compactErr)
                }

                if reloadErr := db.reloadBlocks(); reloadErr != nil {
                        // Roll back: remove the blocks this compaction produced.
                        errs := tsdb_errors.NewMulti(fmt.Errorf("reloadBlocks blocks: %w", reloadErr))
                        for _, uid := range uids {
                                if errRemoveAll := os.RemoveAll(filepath.Join(db.dir, uid.String())); errRemoveAll != nil {
                                        errs.Add(fmt.Errorf("delete persisted block after failed db reloadBlocks:%s: %w", uid, errRemoveAll))
                                }
                        }
                        return errs.Err()
                }
        }
}

// getBlock searches allBlocks for the block whose meta ULID equals id.
// It returns the first match and true, or nil and false if there is none.
func getBlock(allBlocks []*Block, id ulid.ULID) (*Block, bool) {
        for i := range allBlocks {
                if candidate := allBlocks[i]; candidate.Meta().ULID == id {
                        return candidate, true
                }
        }
        return nil, false
}

// reload reloads blocks and truncates the head and its WAL.
// The head is truncated to the max time reported by inOrderBlocksMaxTime;
// when that reports no value, only the blocks are reloaded.
func (db *DB) reload() error {
        err := db.reloadBlocks()
        if err != nil {
                return fmt.Errorf("reloadBlocks: %w", err)
        }

        if maxt, ok := db.inOrderBlocksMaxTime(); ok {
                if truncErr := db.head.Truncate(maxt); truncErr != nil {
                        return fmt.Errorf("head truncate: %w", truncErr)
                }
        }
        return nil
}

// reloadBlocks reloads blocks without touching head.
// Blocks that are obsolete due to replacement or retention will be deleted.
//
// It holds db.mtx for the whole call. The steps are: open all block
// directories, work out which blocks are deletable (selected by blocksToDelete
// or listed as a parent of a loaded block), fail if a corrupted block has no
// loaded replacement, swap in the remaining blocks sorted by MinTime, and
// finally delete the deletable blocks.
// The reloads metric is incremented on every call, reloadsFailed only on error.
func (db *DB) reloadBlocks() (err error) {
        defer func() {
                if err != nil {
                        db.metrics.reloadsFailed.Inc()
                }
                db.metrics.reloads.Inc()
        }()

        // Now that we reload TSDB every minute, there is a high chance for a race condition with a reload
        // triggered by CleanTombstones(). We need to lock the reload to avoid the situation where
        // a normal reload and CleanTombstones try to delete the same block.
        db.mtx.Lock()
        defer db.mtx.Unlock()

        // loadable holds every block that could be opened (or was already open);
        // corrupted maps the ULIDs of blocks that failed to open to their error.
        loadable, corrupted, err := openBlocks(db.logger, db.dir, db.blocks, db.chunkPool)
        if err != nil {
                return err
        }

        deletableULIDs := db.blocksToDelete(loadable)
        // deletable maps a ULID to its block; the value is nil when the block is not (yet) known to be loaded.
        deletable := make(map[ulid.ULID]*Block, len(deletableULIDs))

        // Mark all parents of loaded blocks as deletable (no matter if they exists). This makes it resilient against the process
        // crashing towards the end of a compaction but before deletions. By doing that, we can pick up the deletion where it left off during a crash.
        for _, block := range loadable {
                if _, ok := deletableULIDs[block.meta.ULID]; ok {
                        deletable[block.meta.ULID] = block
                }
                for _, b := range block.Meta().Compaction.Parents {
                        if _, ok := corrupted[b.ULID]; ok {
                                delete(corrupted, b.ULID)
                                level.Warn(db.logger).Log("msg", "Found corrupted block, but replaced by compacted one so it's safe to delete. This should not happen with atomic deletes.", "block", b.ULID)
                        }
                        deletable[b.ULID] = nil
                }
        }

        if len(corrupted) > 0 {
                // Corrupted but no child loaded for it.
                // Close all new blocks to release the lock for windows.
                for _, block := range loadable {
                        if _, open := getBlock(db.blocks, block.Meta().ULID); !open {
                                block.Close()
                        }
                }
                errs := tsdb_errors.NewMulti()
                for ulid, err := range corrupted {
                        if err != nil {
                                errs.Add(fmt.Errorf("corrupted block %s: %w", ulid.String(), err))
                        }
                }
                return errs.Err()
        }

        var (
                toLoad     []*Block
                blocksSize int64
        )
        // All deletable blocks should be unloaded.
        // NOTE: We need to loop through loadable one more time as there might be loadable ready to be removed (replaced by compacted block).
        for _, block := range loadable {
                if _, ok := deletable[block.Meta().ULID]; ok {
                        deletable[block.Meta().ULID] = block
                        continue
                }

                toLoad = append(toLoad, block)
                blocksSize += block.Size()
        }
        db.metrics.blocksBytes.Set(float64(blocksSize))

        // Order the blocks to keep by ascending MinTime.
        slices.SortFunc(toLoad, func(a, b *Block) int {
                switch {
                case a.Meta().MinTime < b.Meta().MinTime:
                        return -1
                case a.Meta().MinTime > b.Meta().MinTime:
                        return 1
                default:
                        return 0
                }
        })

        // Swap new blocks first for subsequently created readers to be seen.
        oldBlocks := db.blocks
        db.blocks = toLoad

        // Only check overlapping blocks when overlapping compaction is enabled.
        if db.opts.EnableOverlappingCompaction {
                blockMetas := make([]BlockMeta, 0, len(toLoad))
                for _, b := range toLoad {
                        blockMetas = append(blockMetas, b.Meta())
                }
                if overlaps := OverlappingBlocks(blockMetas); len(overlaps) > 0 {
                        level.Warn(db.logger).Log("msg", "Overlapping blocks found during reloadBlocks", "detail", overlaps.String())
                }
        }

        // Append blocks to old, deletable blocks, so we can close them.
        for _, b := range oldBlocks {
                if _, ok := deletable[b.Meta().ULID]; ok {
                        deletable[b.Meta().ULID] = b
                }
        }
        if err := db.deleteBlocks(deletable); err != nil {
                return fmt.Errorf("delete %v blocks: %w", len(deletable), err)
        }
        return nil
}

// openBlocks scans dir for block directories and returns a block for each one
// whose meta.json can be read. A block is taken from loaded when getBlock
// finds one for the ULID; otherwise it is opened from disk with OpenBlock.
// Directories with an unreadable meta.json are logged and skipped. Blocks that
// fail to open are reported in corrupted, keyed by ULID, and do not abort the
// scan. The returned error is non-nil only when listing dir fails.
func openBlocks(l log.Logger, dir string, loaded []*Block, chunkPool chunkenc.Pool) (blocks []*Block, corrupted map[ulid.ULID]error, err error) {
        candidates, err := blockDirs(dir)
        if err != nil {
                return nil, nil, fmt.Errorf("find blocks: %w", err)
        }

        corrupted = make(map[ulid.ULID]error)
        for _, blockDir := range candidates {
                meta, _, metaErr := readMetaFile(blockDir)
                if metaErr != nil {
                        level.Error(l).Log("msg", "Failed to read meta.json for a block during reloadBlocks. Skipping", "dir", blockDir, "err", metaErr)
                        continue
                }

                // Prefer the instance that is already held in memory.
                if b, ok := getBlock(loaded, meta.ULID); ok {
                        blocks = append(blocks, b)
                        continue
                }

                b, openErr := OpenBlock(l, blockDir, chunkPool)
                if openErr != nil {
                        corrupted[meta.ULID] = openErr
                        continue
                }
                blocks = append(blocks, b)
        }
        return blocks, corrupted, nil
}

// DefaultBlocksToDelete returns a filter bound to db that selects blocks for
// deletion by delegating to deletableBlocks (compaction-marked blocks plus
// time based and size based retention from the options of the db).
func DefaultBlocksToDelete(db *DB) BlocksToDeleteFunc {
        filter := func(candidates []*Block) map[ulid.ULID]struct{} {
                return deletableBlocks(db, candidates)
        }
        return filter
}

// deletableBlocks returns the ULIDs of all given blocks that are either marked
// deletable in their compaction metadata or selected by BeyondTimeRetention or
// BeyondSizeRetention. The blocks slice is sorted in place as a side effect.
func deletableBlocks(db *DB, blocks []*Block) map[ulid.ULID]struct{} {
        // Order the blocks newest to oldest (largest to smallest MaxTime) so
        // that the retention helpers drop the oldest blocks.
        slices.SortFunc(blocks, func(a, b *Block) int {
                aMax, bMax := a.Meta().MaxTime, b.Meta().MaxTime
                if aMax > bMax {
                        return -1
                }
                if aMax < bMax {
                        return 1
                }
                return 0
        })

        deletable := make(map[ulid.ULID]struct{})

        // Blocks flagged as deletable in their compaction metadata.
        for _, blk := range blocks {
                if meta := blk.Meta(); meta.Compaction.Deletable {
                        deletable[meta.ULID] = struct{}{}
                }
        }

        // Blocks that fall outside the time based retention.
        for id := range BeyondTimeRetention(db, blocks) {
                deletable[id] = struct{}{}
        }

        // Blocks that fall outside the size based retention.
        for id := range BeyondSizeRetention(db, blocks) {
                deletable[id] = struct{}{}
        }

        return deletable
}

// BeyondTimeRetention returns the blocks that are beyond the time retention
// set in the db options. It compares every block against blocks[0], so the
// caller is expected to pass blocks ordered newest first. A nil map is
// returned when time retention is disabled or there are no blocks.
func BeyondTimeRetention(db *DB, blocks []*Block) (deletable map[ulid.ULID]struct{}) {
        if len(blocks) == 0 || db.opts.RetentionDuration == 0 {
                return nil
        }

        deletable = make(map[ulid.ULID]struct{})
        // blocks[0] is the reference point and can never be beyond retention
        // itself, so start at index 1.
        for i := 1; i < len(blocks); i++ {
                if blocks[0].Meta().MaxTime-blocks[i].Meta().MaxTime < db.opts.RetentionDuration {
                        continue
                }
                // This block and every later one is at least the retention
                // period behind the first block.
                for _, expired := range blocks[i:] {
                        deletable[expired.meta.ULID] = struct{}{}
                }
                db.metrics.timeRetentionCount.Inc()
                break
        }
        return deletable
}

// BeyondSizeRetention returns the blocks that are beyond the size retention
// set in the db options. Sizes are accumulated in slice order on top of the
// head size; the first block that pushes the total over MaxBytes and all
// blocks after it are returned. A nil map is returned when size retention is
// disabled or there are no blocks.
func BeyondSizeRetention(db *DB, blocks []*Block) (deletable map[ulid.ULID]struct{}) {
        if len(blocks) == 0 || db.opts.MaxBytes <= 0 {
                return nil
        }

        deletable = make(map[ulid.ULID]struct{})

        // The head size (WAL and head chunks written to disk, per the
        // retention strategy) counts towards the limit too.
        total := db.Head().Size()
        for i := range blocks {
                total += blocks[i].Size()
                if total <= db.opts.MaxBytes {
                        continue
                }
                // Limit exceeded: this block and all following ones must go.
                for _, over := range blocks[i:] {
                        deletable[over.meta.ULID] = struct{}{}
                }
                db.metrics.sizeRetentionCount.Inc()
                break
        }
        return deletable
}

// deleteBlocks closes every loaded block in the map and removes the matching
// block directory from disk when it exists.
// A non-nil map value means the block is loaded in memory, so it is closed
// first; closing may have to wait for pending readers to complete. A failed
// close is only logged. Directory removal errors abort the loop and are
// returned.
func (db *DB) deleteBlocks(blocks map[ulid.ULID]*Block) error {
        for id, loadedBlock := range blocks {
                if loadedBlock != nil {
                        if closeErr := loadedBlock.Close(); closeErr != nil {
                                level.Warn(db.logger).Log("msg", "Closing block failed", "err", closeErr, "block", id)
                        }
                }

                blockDir := filepath.Join(db.dir, id.String())
                _, statErr := os.Stat(blockDir)
                if os.IsNotExist(statErr) {
                        // Nothing on disk for this block.
                        continue
                }
                if statErr != nil {
                        return fmt.Errorf("stat dir %v: %w", blockDir, statErr)
                }

                // Rename to a temporary name first so that a crash during
                // deletion does not leave a partial block under its real name.
                tmpDir := filepath.Join(db.dir, fmt.Sprintf("%s%s", id, tmpForDeletionBlockDirSuffix))
                if err := fileutil.Replace(blockDir, tmpDir); err != nil {
                        return fmt.Errorf("replace of obsolete block for deletion %s: %w", id, err)
                }
                if err := os.RemoveAll(tmpDir); err != nil {
                        return fmt.Errorf("delete obsolete block %s: %w", id, err)
                }
                level.Info(db.logger).Log("msg", "Deleting obsolete block", "block", id)
        }

        return nil
}

// TimeRange describes a time interval by its lower and upper bound.
type TimeRange struct {
        // Min is the lower bound (minTime) of the range.
        Min int64
        // Max is the upper bound (maxTime) of the range.
        Max int64
}

// Overlaps groups the metadata of overlapping blocks by the time range key
// under which they were aggregated.
type Overlaps map[TimeRange][]BlockMeta

// String renders the overlaps in a human readable form, one line per range.
// Each line starts with the range header and lists every block in the group.
// Lines follow map iteration order, so their order is not stable across calls.
func (o Overlaps) String() string {
        var lines []string
        for tr, metas := range o {
                var members []string
                for _, m := range metas {
                        // Times are divided by 1000 and treated as seconds for display.
                        span := time.Duration((m.MaxTime-m.MinTime)/1000) * time.Second
                        members = append(members, fmt.Sprintf(
                                "<ulid: %s, mint: %d, maxt: %d, range: %s>",
                                m.ULID.String(),
                                m.MinTime,
                                m.MaxTime,
                                span.String(),
                        ))
                }

                groupSpan := time.Duration((tr.Max-tr.Min)/1000) * time.Second
                lines = append(lines, fmt.Sprintf(
                        "[mint: %d, maxt: %d, range: %s, blocks: %d]: %s",
                        tr.Min, tr.Max,
                        groupSpan.String(),
                        len(metas),
                        strings.Join(members, ", "),
                ))
        }
        return strings.Join(lines, "\n")
}

// OverlappingBlocks returns all overlapping blocks from given meta files.
// The input must be sorted by MinTime. Fewer than two metas cannot overlap
// and yield nil. Each returned group is keyed by the time range shared by all
// of its blocks: the largest MinTime and the smallest MaxTime in the group.
func OverlappingBlocks(bm []BlockMeta) Overlaps {
        if len(bm) <= 1 {
                return nil
        }
        var (
                overlaps [][]BlockMeta

                // pending contains not ended blocks in regards to "current" timestamp.
                pending = []BlockMeta{bm[0]}
                // continuousPending helps to aggregate same overlaps to single group.
                continuousPending = true
        )

        // We have here blocks sorted by minTime. We iterate over each block and treat its minTime as our "current" timestamp.
        // We check if any of the pending block finished (blocks that we have seen before, but their maxTime was still ahead current
        // timestamp). If not, it means they overlap with our current block. In the same time current block is assumed pending.
        for _, b := range bm[1:] {
                var newPending []BlockMeta

                for _, p := range pending {
                        // "b.MinTime" is our current time.
                        if b.MinTime >= p.MaxTime {
                                // "p" ended before "b" starts, so the running group is broken.
                                continuousPending = false
                                continue
                        }

                        // "p" overlaps with "b" and "p" is still pending.
                        newPending = append(newPending, p)
                }

                // Our block "b" is now pending.
                pending = append(newPending, b)
                if len(newPending) == 0 {
                        // No overlaps.
                        continue
                }

                if continuousPending && len(overlaps) > 0 {
                        // Same set of pending blocks as before: extend the last group.
                        overlaps[len(overlaps)-1] = append(overlaps[len(overlaps)-1], b)
                        continue
                }
                overlaps = append(overlaps, append(newPending, b))
                // Start new pendings.
                continuousPending = true
        }

        // Fetch the critical overlapped time range foreach overlap groups.
        overlapGroups := Overlaps{}
        for _, overlap := range overlaps {
                // Start from the widest possible range and narrow it with every
                // block. Min must start at math.MinInt64 rather than 0, otherwise
                // a group whose blocks all have negative MinTime would report
                // Min as 0.
                minRange := TimeRange{Min: math.MinInt64, Max: math.MaxInt64}
                for _, b := range overlap {
                        if minRange.Max > b.MaxTime {
                                minRange.Max = b.MaxTime
                        }

                        if minRange.Min < b.MinTime {
                                minRange.Min = b.MinTime
                        }
                }
                overlapGroups[minRange] = overlap
        }

        return overlapGroups
}

// String returns the fixed label "HEAD" for the DB, regardless of its state.
func (db *DB) String() string {
        return "HEAD"
}

// Blocks returns the database's persisted blocks. The slice is read under the
// read lock and returned as is, not copied.
func (db *DB) Blocks() []*Block {
        db.mtx.RLock()
        loaded := db.blocks
        db.mtx.RUnlock()

        return loaded
}

// inOrderBlocksMaxTime returns the largest MaxTime among the blocks that were
// not created purely from out-of-order data. ok reports whether at least one
// such block exists; when it is false, maxt is math.MinInt64.
func (db *DB) inOrderBlocksMaxTime() (maxt int64, ok bool) {
        maxt = math.MinInt64
        // Blocks may overlap, so the last block does not necessarily carry the
        // max time: inspect all of them.
        for _, blk := range db.Blocks() {
                if blk.meta.Compaction.FromOutOfOrder() {
                        continue
                }
                if blk.meta.MaxTime > maxt {
                        maxt, ok = blk.meta.MaxTime, true
                }
        }
        return maxt, ok
}

// Head returns the database's head. No lock is taken; the stored pointer is
// returned as is.
func (db *DB) Head() *Head {
        return db.head
}

// Close shuts the DB down. It closes stopc, cancels the compaction context if
// one is set and waits on donec. Then, holding the write lock, it closes all
// blocks concurrently, releases the locker and closes the head when present.
// All errors are collected and returned together.
func (db *DB) Close() error {
        close(db.stopc)
        if db.compactCancel != nil {
                db.compactCancel()
        }
        <-db.donec

        db.mtx.Lock()
        defer db.mtx.Unlock()

        // Close every block in its own goroutine and wait for all of them.
        var closers errgroup.Group
        for _, blk := range db.blocks {
                closers.Go(blk.Close)
        }
        blocksErr := closers.Wait()

        // The locker is released only after all blocks have been closed.
        releaseErr := db.locker.Release()

        errs := tsdb_errors.NewMulti(blocksErr, releaseErr)
        if db.head != nil {
                errs.Add(db.head.Close())
        }
        return errs.Err()
}

// DisableCompactions disables auto compactions.
// It clears the autoCompact flag while holding autoCompactMtx and logs the
// change at info level.
func (db *DB) DisableCompactions() {
        db.autoCompactMtx.Lock()
        defer db.autoCompactMtx.Unlock()

        db.autoCompact = false
        level.Info(db.logger).Log("msg", "Compactions disabled")
}

// EnableCompactions enables auto compactions.
// It sets the autoCompact flag while holding autoCompactMtx and logs the
// change at info level.
func (db *DB) EnableCompactions() {
        db.autoCompactMtx.Lock()
        defer db.autoCompactMtx.Unlock()

        db.autoCompact = true
        level.Info(db.logger).Log("msg", "Compactions enabled")
}

// ForceHeadMMap is intended for use only in tests and benchmarks.
// It calls mmapHeadChunks on the head directly.
func (db *DB) ForceHeadMMap() {
        db.head.mmapHeadChunks()
}

// Snapshot writes the current data to the directory by snapshotting every
// loaded block into dir. If withHead is true, it additionally writes a new
// block containing all data currently held by the head (memory buffer/WAL).
// dir must differ from the DB directory and must not itself parse as a ULID.
// The compaction lock and the read lock are held for the whole operation.
func (db *DB) Snapshot(dir string, withHead bool) error {
        if dir == db.dir {
                return fmt.Errorf("cannot snapshot into base directory")
        }
        if _, parseErr := ulid.ParseStrict(dir); parseErr == nil {
                return fmt.Errorf("dir must not be a valid ULID")
        }

        db.cmtx.Lock()
        defer db.cmtx.Unlock()

        db.mtx.RLock()
        defer db.mtx.RUnlock()

        for _, blk := range db.blocks {
                level.Info(db.logger).Log("msg", "Snapshotting block", "block", blk)

                if err := blk.Snapshot(dir); err != nil {
                        return fmt.Errorf("error snapshotting block: %s: %w", blk.Dir(), err)
                }
        }

        if withHead {
                mint, maxt := db.head.MinTime(), db.head.MaxTime()
                rangeHead := NewRangeHead(db.head, mint, maxt)
                // Block intervals are half-open, [b.MinTime, b.MaxTime), so the
                // block's maxt has to be one millisecond past the last sample.
                if _, err := db.compactor.Write(dir, rangeHead, mint, maxt+1, nil); err != nil {
                        return fmt.Errorf("snapshot head block: %w", err)
                }
        }
        return nil
}

// Querier returns a new querier over the data partition for the given time range.
// The result merges, in this order, a querier for the in-order head (when maxt
// reaches the head's min time), a querier for the out-of-order head (when its
// time range overlaps [mint, maxt]) and one querier per persisted block that
// overlaps [mint, maxt]. The read lock is held while the queriers are opened.
// If any step fails, every querier already added to the list is closed before
// the error is returned.
func (db *DB) Querier(mint, maxt int64) (_ storage.Querier, err error) {
        var blocks []BlockReader

        db.mtx.RLock()
        defer db.mtx.RUnlock()

        // Collect the persisted blocks that overlap the requested range.
        for _, b := range db.blocks {
                if b.OverlapsClosedInterval(mint, maxt) {
                        blocks = append(blocks, b)
                }
        }

        blockQueriers := make([]storage.Querier, 0, len(blocks)+2) // +2 to allow for possible in-order and OOO head queriers

        // The cleanup reads the named result err, which every explicit
        // "return nil, <error>" below sets, even where a local err shadows it.
        defer func() {
                if err != nil {
                        // If we fail, all previously opened queriers must be closed.
                        for _, q := range blockQueriers {
                                // TODO(bwplotka): Handle error.
                                _ = q.Close()
                        }
                }
        }()

        if maxt >= db.head.MinTime() {
                rh := NewRangeHead(db.head, mint, maxt)
                var err error
                inOrderHeadQuerier, err := db.blockQuerierFunc(rh, mint, maxt)
                if err != nil {
                        return nil, fmt.Errorf("open block querier for head %s: %w", rh, err)
                }

                // Getting the querier above registers itself in the queue that the truncation waits on.
                // So if the querier is currently not colliding with any truncation, we can continue to use it and still
                // won't run into a race later since any truncation that comes after will wait on this querier if it overlaps.
                shouldClose, getNew, newMint := db.head.IsQuerierCollidingWithTruncation(mint, maxt)
                if shouldClose {
                        if err := inOrderHeadQuerier.Close(); err != nil {
                                return nil, fmt.Errorf("closing head block querier %s: %w", rh, err)
                        }
                        inOrderHeadQuerier = nil
                }
                if getNew {
                        // Open a replacement querier that starts at newMint instead of mint.
                        rh := NewRangeHead(db.head, newMint, maxt)
                        inOrderHeadQuerier, err = db.blockQuerierFunc(rh, newMint, maxt)
                        if err != nil {
                                return nil, fmt.Errorf("open block querier for head while getting new querier %s: %w", rh, err)
                        }
                }

                // The querier is nil when it was closed above and no replacement was requested.
                if inOrderHeadQuerier != nil {
                        blockQueriers = append(blockQueriers, inOrderHeadQuerier)
                }
        }

        if overlapsClosedInterval(mint, maxt, db.head.MinOOOTime(), db.head.MaxOOOTime()) {
                rh := NewOOORangeHead(db.head, mint, maxt, db.lastGarbageCollectedMmapRef)
                var err error
                outOfOrderHeadQuerier, err := db.blockQuerierFunc(rh, mint, maxt)
                if err != nil {
                        // If BlockQuerierFunc() failed, make sure to clean up the pending read created by NewOOORangeHead.
                        rh.isoState.Close()

                        return nil, fmt.Errorf("open block querier for ooo head %s: %w", rh, err)
                }

                blockQueriers = append(blockQueriers, outOfOrderHeadQuerier)
        }

        // Persisted blocks come after the head queriers.
        for _, b := range blocks {
                q, err := db.blockQuerierFunc(b, mint, maxt)
                if err != nil {
                        return nil, fmt.Errorf("open querier for block %s: %w", b, err)
                }
                blockQueriers = append(blockQueriers, q)
        }

        return storage.NewMergeQuerier(blockQueriers, nil, storage.ChainedSeriesMerge), nil
}

// blockChunkQuerierForRange returns individual block chunk queriers from the persistent blocks, in-order head block, and the
// out-of-order head block, overlapping with the given time range.
// The returned slice lists the in-order head querier first (if any), then the
// out-of-order head querier (if any), then one querier per overlapping
// persisted block. The read lock is held while the queriers are opened. If any
// step fails, every querier already added to the slice is closed before the
// error is returned.
func (db *DB) blockChunkQuerierForRange(mint, maxt int64) (_ []storage.ChunkQuerier, err error) {
        var blocks []BlockReader

        db.mtx.RLock()
        defer db.mtx.RUnlock()

        // Collect the persisted blocks that overlap the requested range.
        for _, b := range db.blocks {
                if b.OverlapsClosedInterval(mint, maxt) {
                        blocks = append(blocks, b)
                }
        }

        blockQueriers := make([]storage.ChunkQuerier, 0, len(blocks)+2) // +2 to allow for possible in-order and OOO head queriers

        // The cleanup reads the named result err, which every explicit
        // "return nil, <error>" below sets, even where a local err shadows it.
        defer func() {
                if err != nil {
                        // If we fail, all previously opened queriers must be closed.
                        for _, q := range blockQueriers {
                                // TODO(bwplotka): Handle error.
                                _ = q.Close()
                        }
                }
        }()

        if maxt >= db.head.MinTime() {
                rh := NewRangeHead(db.head, mint, maxt)
                inOrderHeadQuerier, err := db.blockChunkQuerierFunc(rh, mint, maxt)
                if err != nil {
                        return nil, fmt.Errorf("open querier for head %s: %w", rh, err)
                }

                // Getting the querier above registers itself in the queue that the truncation waits on.
                // So if the querier is currently not colliding with any truncation, we can continue to use it and still
                // won't run into a race later since any truncation that comes after will wait on this querier if it overlaps.
                shouldClose, getNew, newMint := db.head.IsQuerierCollidingWithTruncation(mint, maxt)
                if shouldClose {
                        if err := inOrderHeadQuerier.Close(); err != nil {
                                return nil, fmt.Errorf("closing head querier %s: %w", rh, err)
                        }
                        inOrderHeadQuerier = nil
                }
                if getNew {
                        // Open a replacement querier that starts at newMint instead of mint.
                        rh := NewRangeHead(db.head, newMint, maxt)
                        inOrderHeadQuerier, err = db.blockChunkQuerierFunc(rh, newMint, maxt)
                        if err != nil {
                                return nil, fmt.Errorf("open querier for head while getting new querier %s: %w", rh, err)
                        }
                }

                // The querier is nil when it was closed above and no replacement was requested.
                if inOrderHeadQuerier != nil {
                        blockQueriers = append(blockQueriers, inOrderHeadQuerier)
                }
        }

        if overlapsClosedInterval(mint, maxt, db.head.MinOOOTime(), db.head.MaxOOOTime()) {
                rh := NewOOORangeHead(db.head, mint, maxt, db.lastGarbageCollectedMmapRef)
                outOfOrderHeadQuerier, err := db.blockChunkQuerierFunc(rh, mint, maxt)
                if err != nil {
                        // If blockChunkQuerierFunc() failed, make sure to clean up the pending read created by NewOOORangeHead.
                        rh.isoState.Close()

                        return nil, fmt.Errorf("open block chunk querier for ooo head %s: %w", rh, err)
                }

                blockQueriers = append(blockQueriers, outOfOrderHeadQuerier)
        }

        // Persisted blocks come after the head queriers.
        for _, b := range blocks {
                q, err := db.blockChunkQuerierFunc(b, mint, maxt)
                if err != nil {
                        return nil, fmt.Errorf("open querier for block %s: %w", b, err)
                }
                blockQueriers = append(blockQueriers, q)
        }

        return blockQueriers, nil
}

// ChunkQuerier returns a new chunk querier over the data partition for the
// given time range. It merges the per-block chunk queriers obtained from
// blockChunkQuerierForRange, using a compacting chunk series merger built on
// storage.ChainedSeriesMerge.
func (db *DB) ChunkQuerier(mint, maxt int64) (storage.ChunkQuerier, error) {
        queriers, err := db.blockChunkQuerierForRange(mint, maxt)
        if err != nil {
                return nil, err
        }

        merger := storage.NewCompactingChunkSeriesMerger(storage.ChainedSeriesMerge)
        return storage.NewMergeChunkQuerier(queriers, nil, merger), nil
}

// ExemplarQuerier returns the exemplar querier provided by the head's
// exemplar storage for the given context.
func (db *DB) ExemplarQuerier(ctx context.Context) (storage.ExemplarQuerier, error) {
        return db.head.exemplars.ExemplarQuerier(ctx)
}

// rangeForTimestamp returns the upper bound of the width-sized range that t
// falls into: the smallest multiple of width strictly greater than t, for
// non-negative t.
// NOTE(review): integer division truncates toward zero, so for negative t that
// is not a multiple of width the result lies one width further ahead (e.g.
// t=-5, width=10 yields 10). Confirm whether callers pass negative timestamps.
func rangeForTimestamp(t, width int64) (maxt int64) {
        completedRanges := t / width
        return width * (completedRanges + 1)
}

// Delete implements deletion of metrics. It only has atomicity guarantees on a
// per-block basis. The deletion runs concurrently on every persisted block and
// on the head, as far as they overlap [mint, maxt]. The compaction lock and
// the read lock are held until all deletions have finished. The result is the
// error returned by the errgroup's Wait.
func (db *DB) Delete(ctx context.Context, mint, maxt int64, ms ...*labels.Matcher) error {
        db.cmtx.Lock()
        defer db.cmtx.Unlock()

        var g errgroup.Group

        db.mtx.RLock()
        defer db.mtx.RUnlock()

        for _, b := range db.blocks {
                if !b.OverlapsClosedInterval(mint, maxt) {
                        continue
                }
                // Bind the block to a per-iteration variable so each goroutine
                // works on its own block.
                blk := b
                g.Go(func() error {
                        return blk.Delete(ctx, mint, maxt, ms...)
                })
        }
        if db.head.OverlapsClosedInterval(mint, maxt) {
                g.Go(func() error {
                        return db.head.Delete(ctx, mint, maxt, ms...)
                })
        }

        return g.Wait()
}

// CleanTombstones re-writes any blocks with tombstones.
// It holds the compaction lock for the whole run and records the elapsed time
// in the tombCleanTimer metric. Blocks are processed one at a time. After a
// block has been cleaned it is marked deletable and the blocks are reloaded;
// on success the scan restarts over the new set of blocks. If the reload
// fails, the blocks written for that clean-up are removed again and the
// reload error is returned.
func (db *DB) CleanTombstones() (err error) {
        db.cmtx.Lock()
        defer db.cmtx.Unlock()

        start := time.Now()
        defer func() {
                db.metrics.tombCleanTimer.Observe(time.Since(start).Seconds())
        }()

        cleanUpCompleted := false
        // Repeat cleanup until there is no tombstones left.
        for !cleanUpCompleted {
                cleanUpCompleted = true

                for _, pb := range db.Blocks() {
                        uids, safeToDelete, cleanErr := pb.CleanTombstones(db.Dir(), db.compactor)
                        if cleanErr != nil {
                                return fmt.Errorf("clean tombstones: %s: %w", pb.Dir(), cleanErr)
                        }
                        if !safeToDelete {
                                // There was nothing to clean.
                                continue
                        }

                        // In case tombstones of the old block covers the whole block,
                        // then there would be no resultant block to tell the parent.
                        // The lock protects against race conditions when deleting blocks
                        // during an already running reload.
                        db.mtx.Lock()
                        pb.meta.Compaction.Deletable = safeToDelete
                        db.mtx.Unlock()
                        cleanUpCompleted = false
                        if err = db.reloadBlocks(); err == nil { // Will try to delete old block.
                                // Successful reload will change the existing blocks.
                                // We need to loop over the new set of blocks.
                                break
                        }

                        // The reload failed, so err is non-nil from here on.
                        // Delete new block if it was created.
                        for _, uid := range uids {
                                dir := filepath.Join(db.Dir(), uid.String())
                                if rmErr := os.RemoveAll(dir); rmErr != nil {
                                        level.Error(db.logger).Log("msg", "failed to delete block after failed `CleanTombstones`", "dir", dir, "err", rmErr)
                                }
                        }
                        return fmt.Errorf("reload blocks: %w", err)
                }
        }
        return nil
}

// SetWriteNotified stores wn on the DB and also on the DB's head. No lock is
// taken while the two fields are assigned.
func (db *DB) SetWriteNotified(wn wlog.WriteNotified) {
        db.writeNotified = wn
        // It's possible we already created the head struct, so we should also set the WN for that.
        db.head.writeNotified = wn
}

// isBlockDir reports whether fi is a directory whose name parses strictly as
// a ULID.
func isBlockDir(fi fs.DirEntry) bool {
        if fi.IsDir() {
                _, parseErr := ulid.ParseStrict(fi.Name())
                return parseErr == nil
        }
        return false
}

// isTmpDir reports whether fi is a temporary directory: a directory whose
// extension is one of the tmp suffixes (for deletion, for creation, legacy)
// and whose name either starts with the checkpoint prefix, starts with the
// chunk snapshot prefix, or is a block ULID once the extension is removed.
func isTmpDir(fi fs.DirEntry) bool {
        if !fi.IsDir() {
                return false
        }

        name := fi.Name()
        suffix := filepath.Ext(name)
        hasTmpSuffix := suffix == tmpForDeletionBlockDirSuffix ||
                suffix == tmpForCreationBlockDirSuffix ||
                suffix == tmpLegacy
        if !hasTmpSuffix {
                return false
        }

        if strings.HasPrefix(name, "checkpoint.") || strings.HasPrefix(name, chunkSnapshotPrefix) {
                return true
        }

        // What remains has to be a block ULID followed by the tmp suffix.
        _, parseErr := ulid.ParseStrict(strings.TrimSuffix(name, suffix))
        return parseErr == nil
}

// blockDirs lists the entries of dir and returns the full paths of those that
// isBlockDir accepts. The error from reading dir is returned unchanged.
func blockDirs(dir string) ([]string, error) {
        entries, err := os.ReadDir(dir)
        if err != nil {
                return nil, err
        }

        var dirs []string
        for _, entry := range entries {
                if !isBlockDir(entry) {
                        continue
                }
                dirs = append(dirs, filepath.Join(dir, entry.Name()))
        }
        return dirs, nil
}

// exponential returns the next backoff duration: d doubled, raised to min if
// smaller and capped at max if larger. The cap is applied last, so max wins
// when min exceeds max.
func exponential(d, min, max time.Duration) time.Duration {
        next := 2 * d
        if next < min {
                next = min
        }
        if next > max {
                return max
        }
        return next
}

// Copyright 2018 The Prometheus Authors
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package encoding

import (
        "encoding/binary"
        "errors"
        "fmt"
        "hash"
        "hash/crc32"
        "math"
        "unsafe"

        "github.com/dennwc/varint"
)

// Sentinel errors used by the decoding helpers in this package.
// ErrInvalidSize is set when a byte slice is too short for the requested data;
// ErrInvalidChecksum is set when a stored CRC32 checksum does not match the
// checksum computed over the contents.
var (
        ErrInvalidSize     = errors.New("invalid size")
        ErrInvalidChecksum = errors.New("invalid checksum")
)

// Encbuf is a helper type to populate a byte slice with various types.
type Encbuf struct {
        // B is the buffer that all Put* methods append to.
        B []byte
        // C is scratch space into which integers are encoded before they are
        // appended to B.
        C [binary.MaxVarintLen64]byte
}

// Reset truncates the buffer to zero length, keeping its capacity.
func (e *Encbuf) Reset() {
        e.B = e.B[:0]
}

// Get returns the buffer contents without copying them.
func (e *Encbuf) Get() []byte {
        return e.B
}

// Len returns the number of bytes currently in the buffer.
func (e *Encbuf) Len() int {
        return len(e.B)
}

// PutString appends the raw bytes of s, without any length prefix.
func (e *Encbuf) PutString(s string) {
        e.B = append(e.B, s...)
}

// PutByte appends the single byte c.
func (e *Encbuf) PutByte(c byte) {
        e.B = append(e.B, c)
}

// PutBytes appends b, without any length prefix.
func (e *Encbuf) PutBytes(b []byte) {
        e.B = append(e.B, b...)
}

// PutBE32int appends x, converted to uint32, as 4 big endian bytes.
func (e *Encbuf) PutBE32int(x int) {
        e.PutBE32(uint32(x))
}

// PutUvarint32 appends x as an unsigned varint.
func (e *Encbuf) PutUvarint32(x uint32) {
        e.PutUvarint64(uint64(x))
}

// PutBE64int64 appends x, converted to uint64, as 8 big endian bytes.
func (e *Encbuf) PutBE64int64(x int64) {
        e.PutBE64(uint64(x))
}

// PutUvarint appends x, converted to uint64, as an unsigned varint.
func (e *Encbuf) PutUvarint(x int) {
        e.PutUvarint64(uint64(x))
}

// PutBE32 appends x as 4 big endian bytes.
func (e *Encbuf) PutBE32(x uint32) {
        scratch := e.C[:4]
        binary.BigEndian.PutUint32(scratch, x)
        e.B = append(e.B, scratch...)
}

// PutBE64 appends x as 8 big endian bytes.
func (e *Encbuf) PutBE64(x uint64) {
        scratch := e.C[:8]
        binary.BigEndian.PutUint64(scratch, x)
        e.B = append(e.B, scratch...)
}

// PutBEFloat64 appends the IEEE 754 bit pattern of x as 8 big endian bytes.
func (e *Encbuf) PutBEFloat64(x float64) {
        bits := math.Float64bits(x)
        e.PutBE64(bits)
}

// PutUvarint64 appends x as an unsigned varint.
func (e *Encbuf) PutUvarint64(x uint64) {
        encoded := e.C[:binary.PutUvarint(e.C[:], x)]
        e.B = append(e.B, encoded...)
}

// PutVarint64 appends x as a signed varint.
func (e *Encbuf) PutVarint64(x int64) {
        encoded := e.C[:binary.PutVarint(e.C[:], x)]
        e.B = append(e.B, encoded...)
}

// PutUvarintStr writes a string to the buffer prefixed by its varint length (in bytes!).
func (e *Encbuf) PutUvarintStr(s string) {
        // The string header is viewed as a slice header only to read its
        // length; the contents are not touched through this view.
        // NOTE(review): len(s) already is the byte length and would avoid unsafe.
        raw := *(*[]byte)(unsafe.Pointer(&s))
        e.PutUvarint64(uint64(len(raw)))
        e.B = append(e.B, s...)
}

// PutUvarintBytes writes a variable length byte buffer: the length of b as an
// unsigned varint followed by b itself.
func (e *Encbuf) PutUvarintBytes(b []byte) {
        e.PutUvarint64(uint64(len(b)))
        e.B = append(e.B, b...)
}

// PutHash appends a hash over the buffers current contents to the buffer.
// h is reset before the contents are written to it.
func (e *Encbuf) PutHash(h hash.Hash) {
        h.Reset()
        if _, err := h.Write(e.B); err != nil {
                panic(err) // The CRC32 implementation does not error
        }
        e.B = h.Sum(e.B)
}

// WriteToHash writes the current buffer contents to the given hash. It panics
// if the hash reports a write error.
func (e *Encbuf) WriteToHash(h hash.Hash) {
        if _, err := h.Write(e.B); err != nil {
                panic(err) // The CRC32 implementation does not error
        }
}

// PutHashSum writes the Sum of the given hash to the buffer.
func (e *Encbuf) PutHashSum(h hash.Hash) {
        e.B = h.Sum(e.B)
}

// Decbuf provides safe methods to extract data from a byte slice. It does all
// necessary bounds checking and advancing of the byte slice.
// Several datums can be extracted without checking for errors. However, before using
// any datum, the error state (field E) must be checked.
type Decbuf struct {
        // B holds the bytes that have not been consumed yet.
        B []byte
        // E holds an error set by a constructor or by a decoding method.
        E error
}

// NewDecbufAt returns a new decoding buffer. It expects the first 4 bytes
// after offset to hold the big endian encoded content length, followed by the
// contents and the expected checksum (4 bytes, big endian).
// The returned buffer covers the contents only. If bs is too short, the
// buffer carries ErrInvalidSize. If castagnoliTable is non-nil and the CRC32
// of the contents differs from the stored checksum, it carries
// ErrInvalidChecksum. With a nil table the checksum is not verified.
func NewDecbufAt(bs ByteSlice, off int, castagnoliTable *crc32.Table) Decbuf {
        contentStart := off + 4
        if bs.Len() < contentStart {
                return Decbuf{E: ErrInvalidSize}
        }
        contentLen := int(binary.BigEndian.Uint32(bs.Range(off, contentStart)))

        end := contentStart + contentLen + 4
        if bs.Len() < end {
                return Decbuf{E: ErrInvalidSize}
        }

        // raw holds the contents followed by the 4 byte CRC32 checksum.
        raw := bs.Range(contentStart, end)
        crcOff := len(raw) - 4
        dec := Decbuf{B: raw[:crcOff]}

        if castagnoliTable == nil {
                return dec
        }
        if dec.Crc32(castagnoliTable) != binary.BigEndian.Uint32(raw[crcOff:]) {
                return Decbuf{E: ErrInvalidChecksum}
        }
        return dec
}

// NewDecbufUvarintAt returns a new decoding buffer. It expects the first bytes
// after offset to hold the uvarint-encoded buffers length, followed by the
// contents and the expected checksum (4 bytes, big endian).
// The returned buffer covers the contents only. It carries ErrInvalidSize if
// bs is too short, an "invalid uvarint" error if the length cannot be decoded
// within MaxVarintLen32 bytes, and ErrInvalidChecksum on a CRC32 mismatch.
// NOTE(review): unlike NewDecbufAt, castagnoliTable is used without a nil
// check here.
func NewDecbufUvarintAt(bs ByteSlice, off int, castagnoliTable *crc32.Table) Decbuf {
        const maxLenBytes = binary.MaxVarintLen32

        // We never have to access this method at the far end of the byte slice. Thus just checking
        // against the MaxVarintLen32 is sufficient.
        if bs.Len() < off+maxLenBytes {
                return Decbuf{E: ErrInvalidSize}
        }

        l, n := varint.Uvarint(bs.Range(off, off+maxLenBytes))
        if n <= 0 || n > maxLenBytes {
                return Decbuf{E: fmt.Errorf("invalid uvarint %d", n)}
        }

        start := off + n
        end := start + int(l) + 4
        if bs.Len() < end {
                return Decbuf{E: ErrInvalidSize}
        }

        // raw holds the contents followed by the 4 byte CRC32 checksum.
        raw := bs.Range(start, end)
        payload, stored := raw[:len(raw)-4], raw[len(raw)-4:]
        dec := Decbuf{B: payload}

        if dec.Crc32(castagnoliTable) != binary.BigEndian.Uint32(stored) {
                return Decbuf{E: ErrInvalidChecksum}
        }
        return dec
}

// NewDecbufRaw wraps the first length bytes of bs in a decoding buffer.
// If bs holds fewer than length bytes, the returned Decbuf carries ErrInvalidSize.
func NewDecbufRaw(bs ByteSlice, length int) Decbuf {
        if length > bs.Len() {
                return Decbuf{E: ErrInvalidSize}
        }
        contents := bs.Range(0, length)
        return Decbuf{B: contents}
}

// Uvarint decodes the next uvarint and returns it converted to int.
func (d *Decbuf) Uvarint() int {
        v := d.Uvarint64()
        return int(v)
}

// Uvarint32 decodes the next uvarint and returns it truncated to uint32.
func (d *Decbuf) Uvarint32() uint32 {
        v := d.Uvarint64()
        return uint32(v)
}

// Be32int decodes the next big endian 32-bit value and returns it as int.
func (d *Decbuf) Be32int() int {
        v := d.Be32()
        return int(v)
}

// Be64int64 decodes the next big endian 64-bit value and returns it as int64.
func (d *Decbuf) Be64int64() int64 {
        v := d.Be64()
        return int64(v)
}

// Crc32 computes the CRC32 checksum of the bytes still left in the buffer,
// using the given table. It does not consume any bytes.
func (d *Decbuf) Crc32(castagnoliTable *crc32.Table) uint32 {
        sum := crc32.Checksum(d.B, castagnoliTable)
        return sum
}

// Skip advances the buffer by l bytes. If l is negative or larger than the
// number of remaining bytes, the buffer is left untouched and the error is set
// to ErrInvalidSize.
func (d *Decbuf) Skip(l int) {
        // A negative l can come from converting a corrupted uvarint to int
        // (see Uvarint); slicing with it would panic, so reject it here.
        if l < 0 || len(d.B) < l {
                d.E = ErrInvalidSize
                return
        }
        d.B = d.B[l:]
}

// UvarintStr decodes a uvarint length prefix followed by that many bytes and
// returns the bytes as a string (copying them).
func (d *Decbuf) UvarintStr() string {
        raw := d.UvarintBytes()
        return string(raw)
}

// UvarintBytes decodes a uvarint length prefix and returns the following
// bytes of that length, advancing the buffer past them.
// The return value becomes invalid if the byte slice goes away.
// Compared to UvarintStr, this avoids allocations.
// On error an empty slice is returned and the error is recorded in d.E.
func (d *Decbuf) UvarintBytes() []byte {
        l := d.Uvarint64()
        if d.E != nil {
                return []byte{}
        }
        // Compare in uint64: converting a huge l to int would wrap to a negative
        // number, pass the bounds check and make the slicing below panic.
        if uint64(len(d.B)) < l {
                d.E = ErrInvalidSize
                return []byte{}
        }
        s := d.B[:l]
        d.B = d.B[l:]
        return s
}

// Varint64 decodes the next zigzag-encoded signed varint and advances the
// buffer. It returns 0 if an error is already recorded, and records
// ErrInvalidSize if no valid varint can be read.
func (d *Decbuf) Varint64() int64 {
        if d.E != nil {
                return 0
        }
        // Decode as unsigned first, since that's what the varint library implements.
        raw, size := varint.Uvarint(d.B)
        if size < 1 {
                d.E = ErrInvalidSize
                return 0
        }
        d.B = d.B[size:]

        // Undo the "ZigZag encoding" https://developers.google.com/protocol-buffers/docs/encoding#signed_integers.
        // The lowest bit carries the sign, the remaining bits the magnitude.
        magnitude := int64(raw >> 1)
        if raw&1 == 0 {
                return magnitude
        }
        return ^magnitude
}

// Uvarint64 decodes the next unsigned varint and advances the buffer.
// It returns 0 if an error is already recorded, and records ErrInvalidSize if
// no valid uvarint can be read.
func (d *Decbuf) Uvarint64() uint64 {
        if d.E != nil {
                return 0
        }
        value, size := varint.Uvarint(d.B)
        if size >= 1 {
                d.B = d.B[size:]
                return value
        }
        d.E = ErrInvalidSize
        return 0
}

// Be64 decodes the next 8 bytes as a big endian uint64 and advances the buffer.
// It returns 0 if an error is already recorded, and records ErrInvalidSize if
// fewer than 8 bytes remain.
func (d *Decbuf) Be64() uint64 {
        const width = 8

        switch {
        case d.E != nil:
                return 0
        case len(d.B) < width:
                d.E = ErrInvalidSize
                return 0
        }

        value := binary.BigEndian.Uint64(d.B)
        d.B = d.B[width:]
        return value
}

// Be64Float64 decodes the next 8 bytes as a big endian uint64 and reinterprets
// those bits as a float64.
func (d *Decbuf) Be64Float64() float64 {
        bits := d.Be64()
        return math.Float64frombits(bits)
}

// Be32 decodes the next 4 bytes as a big endian uint32 and advances the buffer.
// It returns 0 if an error is already recorded, and records ErrInvalidSize if
// fewer than 4 bytes remain.
func (d *Decbuf) Be32() uint32 {
        const width = 4

        switch {
        case d.E != nil:
                return 0
        case len(d.B) < width:
                d.E = ErrInvalidSize
                return 0
        }

        value := binary.BigEndian.Uint32(d.B)
        d.B = d.B[width:]
        return value
}

// Byte returns the next byte and advances the buffer by one.
// It returns 0 if an error is already recorded, and records ErrInvalidSize if
// the buffer is empty.
func (d *Decbuf) Byte() byte {
        switch {
        case d.E != nil:
                return 0
        case len(d.B) < 1:
                d.E = ErrInvalidSize
                return 0
        }

        first := d.B[0]
        d.B = d.B[1:]
        return first
}

// ConsumePadding drops leading zero bytes from the buffer, always leaving at
// least one byte in place. It does nothing if an error is already recorded and
// records ErrInvalidSize if the buffer is empty.
func (d *Decbuf) ConsumePadding() {
        if d.E != nil {
                return
        }
        for {
                // Stop at the first non-zero byte, or when only one byte is left.
                if len(d.B) <= 1 || d.B[0] != '\x00' {
                        break
                }
                d.B = d.B[1:]
        }
        if len(d.B) == 0 {
                d.E = ErrInvalidSize
        }
}

// Err returns the error recorded while decoding, or nil if there is none.
func (d *Decbuf) Err() error {
        return d.E
}

// Len returns the number of bytes that have not been consumed yet.
func (d *Decbuf) Len() int {
        return len(d.B)
}

// Get returns the bytes that have not been consumed yet, without copying them.
func (d *Decbuf) Get() []byte {
        return d.B
}

// ByteSlice abstracts a byte slice.
// It is the input type of the NewDecbuf* constructors.
type ByteSlice interface {
        // Len returns the total number of bytes available.
        Len() int
        // Range returns the bytes between start and end.
        // NOTE(review): callers treat the result as the half-open range
        // [start, end) — confirm against implementations.
        Range(start, end int) []byte
}

// Copyright 2016 The etcd Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package errors

import (
        "bytes"
        "errors"
        "fmt"
        "io"
)

// multiError is a list of errors that can be combined into a single error.
// Use Add to collect errors and Err to obtain the combined error (or nil).
type multiError []error

// NewMulti creates a multiError holding every non-nil error from errs.
// The result is always a non-nil (possibly empty) list.
func NewMulti(errs ...error) multiError { //nolint:revive // unexported-return.
        combined := multiError{}
        combined.Add(errs...)
        return combined
}

// Add appends the given errors to the list, skipping nil ones.
// An error that is (or wraps) a nonNilMultiError is flattened: its inner errors
// are appended instead of the error itself.
func (es *multiError) Add(errs ...error) {
        for _, candidate := range errs {
                if candidate == nil {
                        continue
                }
                var nested nonNilMultiError
                if errors.As(candidate, &nested) {
                        *es = append(*es, nested.errs...)
                } else {
                        *es = append(*es, candidate)
                }
        }
}

// Err converts the list into a single error value.
// It returns nil when the list is empty, so callers can use the usual err != nil check.
func (es multiError) Err() error {
        if len(es) > 0 {
                return nonNilMultiError{errs: es}
        }
        return nil
}

// nonNilMultiError implements the error interface and represents a
// multiError with at least one error inside it.
// This separate type is needed so that multiError.Err can return a plain nil
// when no error was collected, which keeps err != nil checks working.
type nonNilMultiError struct {
        // errs holds the collected errors.
        errs multiError
}

// Error renders all contained errors separated by "; ".
// When there is more than one error the output is prefixed with their count.
func (es nonNilMultiError) Error() string {
        var out bytes.Buffer

        total := len(es.errs)
        if total > 1 {
                fmt.Fprintf(&out, "%d errors: ", total)
        }

        for i := range es.errs {
                if i > 0 {
                        out.WriteString("; ")
                }
                out.WriteString(es.errs[i].Error())
        }

        return out.String()
}

// Is reports whether any of the contained errors matches target.
//
// It lets errors.Is look inside the combined error: each contained error is
// checked with errors.Is, and the first match wins.
func (es nonNilMultiError) Is(target error) bool {
        for i := range es.errs {
                if errors.Is(es.errs[i], target) {
                        return true
                }
        }
        return false
}

// CloseAll calls Close on every closer in cs, even if some of them fail, and
// returns the collected errors as one error (nil if all closes succeeded).
func CloseAll(cs []io.Closer) error {
        collected := NewMulti()
        for i := range cs {
                collected.Add(cs[i].Close())
        }
        return collected.Err()
}

// Copyright 2020 The Prometheus Authors
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package tsdb

import (
        "context"
        "errors"
        "slices"
        "sync"
        "unicode/utf8"

        "github.com/prometheus/client_golang/prometheus"

        "github.com/prometheus/prometheus/config"
        "github.com/prometheus/prometheus/model/exemplar"
        "github.com/prometheus/prometheus/model/labels"
        "github.com/prometheus/prometheus/storage"
)

const (
        // Indicates that there is no index entry for an exemplar.
        // Used as the "next" value of the last exemplar of a series.
        noExemplar = -1
        // Estimated number of exemplars per series, for sizing the index.
        estimatedExemplarsPerSeries = 16
)

// CircularExemplarStorage keeps exemplars in a fixed-size circular buffer.
// Once the buffer is full, appending a new exemplar overwrites the slot at
// nextIndex.
type CircularExemplarStorage struct {
        // lock guards the fields below.
        lock      sync.RWMutex
        // exemplars is the circular buffer; nextIndex is the slot the next append writes to.
        exemplars []circularBufferEntry
        nextIndex int
        metrics   *ExemplarMetrics

        // Map of series labels as a string to index entry, which points to the first
        // and last exemplar for the series in the exemplars circular buffer.
        index map[string]*indexEntry
}

// indexEntry records, for one series, where its exemplars sit in the circular buffer.
type indexEntry struct {
        // oldest and newest are indexes into the exemplars buffer of the first
        // and last exemplar stored for the series.
        oldest       int
        newest       int
        // seriesLabels are the labels of the series the exemplars belong to.
        seriesLabels labels.Labels
}

// circularBufferEntry is one slot of the circular exemplar buffer.
type circularBufferEntry struct {
        exemplar exemplar.Exemplar
        // next is the buffer index of the next exemplar of the same series,
        // or noExemplar if this is the newest one.
        next     int
        // ref points to the index entry of the owning series; nil means the slot is unused.
        ref      *indexEntry
}

// ExemplarMetrics bundles the Prometheus metrics exposed by the circular
// exemplar storage. Create it with NewExemplarMetrics.
type ExemplarMetrics struct {
        // Total number of appended exemplars.
        exemplarsAppended            prometheus.Counter
        // Number of exemplars currently in circular storage.
        exemplarsInStorage           prometheus.Gauge
        // Number of series with exemplars currently in circular storage.
        seriesWithExemplarsInStorage prometheus.Gauge
        // Timestamp (in seconds) of the oldest exemplar stored in circular storage.
        lastExemplarsTs              prometheus.Gauge
        // Total number of exemplars the storage can hold.
        maxExemplars                 prometheus.Gauge
        // Total number of failed out-of-order exemplar ingestion attempts.
        outOfOrderExemplars          prometheus.Counter
}

// NewExemplarMetrics creates the metrics used by the circular exemplar storage.
// If reg is non-nil all metrics are registered with it; registration failures
// panic (MustRegister).
func NewExemplarMetrics(reg prometheus.Registerer) *ExemplarMetrics {
        m := ExemplarMetrics{
                exemplarsAppended: prometheus.NewCounter(prometheus.CounterOpts{
                        Name: "prometheus_tsdb_exemplar_exemplars_appended_total",
                        Help: "Total number of appended exemplars.",
                }),
                exemplarsInStorage: prometheus.NewGauge(prometheus.GaugeOpts{
                        Name: "prometheus_tsdb_exemplar_exemplars_in_storage",
                        Help: "Number of exemplars currently in circular storage.",
                }),
                seriesWithExemplarsInStorage: prometheus.NewGauge(prometheus.GaugeOpts{
                        Name: "prometheus_tsdb_exemplar_series_with_exemplars_in_storage",
                        Help: "Number of series with exemplars currently in circular storage.",
                }),
                lastExemplarsTs: prometheus.NewGauge(prometheus.GaugeOpts{
                        Name: "prometheus_tsdb_exemplar_last_exemplars_timestamp_seconds",
                        // Each fragment ends with a space so the concatenated text does not
                        // run words together ("timerange", "timestampfor").
                        Help: "The timestamp of the oldest exemplar stored in circular storage. Useful to check for what time " +
                                "range the current exemplar buffer limit allows. This usually means the last timestamp " +
                                "for all exemplars for a typical setup. This is not true though if one of the series timestamp is in future compared to rest series.",
                }),
                outOfOrderExemplars: prometheus.NewCounter(prometheus.CounterOpts{
                        Name: "prometheus_tsdb_exemplar_out_of_order_exemplars_total",
                        Help: "Total number of out of order exemplar ingestion failed attempts.",
                }),
                maxExemplars: prometheus.NewGauge(prometheus.GaugeOpts{
                        Name: "prometheus_tsdb_exemplar_max_exemplars",
                        Help: "Total number of exemplars the exemplar storage can store, resizeable.",
                }),
        }

        if reg != nil {
                reg.MustRegister(
                        m.exemplarsAppended,
                        m.exemplarsInStorage,
                        m.seriesWithExemplarsInStorage,
                        m.lastExemplarsTs,
                        m.outOfOrderExemplars,
                        m.maxExemplars,
                )
        }

        return &m
}

// NewCircularExemplarStorage creates a circular in memory exemplar storage
// with room for length exemplars.
// If we assume the average case 95 bytes per exemplar we can fit 5651272 exemplars in
// 1GB of extra memory, accounting for the fact that this is heap allocated space.
// If length <= 0, then the exemplar storage is essentially a noop storage but can later be
// resized to store exemplars.
// The maxExemplars gauge of m is set to the effective length.
func NewCircularExemplarStorage(length int64, m *ExemplarMetrics) (ExemplarStorage, error) {
        // Negative sizes are treated as zero.
        if length < 0 {
                length = 0
        }

        buffer := make([]circularBufferEntry, length)
        seriesIndex := make(map[string]*indexEntry, length/estimatedExemplarsPerSeries)
        storage := &CircularExemplarStorage{
                exemplars: buffer,
                index:     seriesIndex,
                metrics:   m,
        }

        storage.metrics.maxExemplars.Set(float64(length))

        return storage, nil
}

// ApplyConfig resizes the storage to the maximum number of exemplars given in
// cfg. It always returns nil.
func (ce *CircularExemplarStorage) ApplyConfig(cfg *config.Config) error {
        maxExemplars := cfg.StorageConfig.ExemplarsConfig.MaxExemplars
        ce.Resize(maxExemplars)
        return nil
}

// Appender returns the storage itself, which is what exemplars are appended to.
func (ce *CircularExemplarStorage) Appender() *CircularExemplarStorage {
        return ce
}

// ExemplarQuerier returns the storage itself as the exemplar querier.
// The context is unused and the error is always nil.
func (ce *CircularExemplarStorage) ExemplarQuerier(_ context.Context) (storage.ExemplarQuerier, error) {
        return ce, nil
}

// Querier returns the storage itself as the exemplar querier.
// The context is unused and the error is always nil.
func (ce *CircularExemplarStorage) Querier(_ context.Context) (storage.ExemplarQuerier, error) {
        return ce, nil
}

// Select returns exemplars for a given set of label matchers.
// A series is included when it satisfies every matcher of at least one of the
// matcher sets and has exemplars with a timestamp in [start, end]. The result
// is sorted by series labels; the error is always nil.
func (ce *CircularExemplarStorage) Select(start, end int64, matchers ...[]*labels.Matcher) ([]exemplar.QueryResult, error) {
        ret := make([]exemplar.QueryResult, 0)

        // Take the read lock before looking at ce.exemplars: Resize replaces the
        // buffer under the write lock, so an unlocked read here would be a data race.
        ce.lock.RLock()
        defer ce.lock.RUnlock()

        if len(ce.exemplars) == 0 {
                return ret, nil
        }

        // Loop through each index entry, which will point us to first/last exemplar for each series.
        for _, idx := range ce.index {
                var se exemplar.QueryResult
                e := ce.exemplars[idx.oldest]
                // Skip series whose exemplars lie entirely outside the requested range.
                if e.exemplar.Ts > end || ce.exemplars[idx.newest].exemplar.Ts < start {
                        continue
                }
                if !matchesSomeMatcherSet(idx.seriesLabels, matchers) {
                        continue
                }
                se.SeriesLabels = idx.seriesLabels

                // Loop through all exemplars in the circular buffer for the current series.
                for e.exemplar.Ts <= end {
                        if e.exemplar.Ts >= start {
                                se.Exemplars = append(se.Exemplars, e.exemplar)
                        }
                        if e.next == noExemplar {
                                break
                        }
                        e = ce.exemplars[e.next]
                }
                if len(se.Exemplars) > 0 {
                        ret = append(ret, se)
                }
        }

        // Map iteration order is random; sort for deterministic output.
        slices.SortFunc(ret, func(a, b exemplar.QueryResult) int {
                return labels.Compare(a.SeriesLabels, b.SeriesLabels)
        })

        return ret, nil
}

// matchesSomeMatcherSet reports whether lbls satisfies every matcher of at
// least one of the given matcher sets. An empty set matches anything; an empty
// list of sets matches nothing.
func matchesSomeMatcherSet(lbls labels.Labels, matchers [][]*labels.Matcher) bool {
        for _, set := range matchers {
                allMatch := true
                for _, m := range set {
                        if !m.Matches(lbls.Get(m.Name)) {
                                allMatch = false
                                break
                        }
                }
                if allMatch {
                        return true
                }
        }
        return false
}

// ValidateExemplar checks whether e could be appended for the series with
// labels l, without storing it and without touching any metrics.
func (ce *CircularExemplarStorage) ValidateExemplar(l labels.Labels, e exemplar.Exemplar) error {
        var scratch [1024]byte
        key := l.Bytes(scratch[:])

        // TODO(bwplotka): This lock can lock all scrapers, there might high contention on this on scale.
        // Optimize by moving the lock to be per series (& benchmark it).
        ce.lock.RLock()
        defer ce.lock.RUnlock()

        entry := ce.index[string(key)]
        return ce.validateExemplar(entry, e, false)
}

// validateExemplar checks e against the storage state and the newest exemplar
// of the series described by idx (nil if the series has no exemplars yet).
// Not thread safe; the caller must hold the lock. appended tells whether this
// is part of an AddExemplar call, in which case the out-of-order metric is
// updated, or an external validation, in which case no metrics are touched.
func (ce *CircularExemplarStorage) validateExemplar(idx *indexEntry, e exemplar.Exemplar, appended bool) error {
        if len(ce.exemplars) == 0 {
                return storage.ErrExemplarsDisabled
        }

        // Exemplar label length does not include chars involved in text rendering such as quotes
        // equals sign, or commas. See definition of const ExemplarMaxLabelSetLength.
        runes := 0
        checkLength := func(l labels.Label) error {
                runes += utf8.RuneCountInString(l.Name)
                runes += utf8.RuneCountInString(l.Value)

                if runes > exemplar.ExemplarMaxLabelSetLength {
                        return storage.ErrExemplarLabelLength
                }
                return nil
        }
        if err := e.Labels.Validate(checkLength); err != nil {
                return err
        }

        // First exemplar for this series: nothing to compare against.
        if idx == nil {
                return nil
        }

        // Check for duplicate vs last stored exemplar for this series.
        // NB these are expected, and appending them is a no-op.
        // For floats and classic histograms, there is only 1 exemplar per series,
        // so this is sufficient. For native histograms with multiple exemplars per series,
        // we have another check below.
        newest := ce.exemplars[idx.newest].exemplar
        if newest.Equals(e) {
                return storage.ErrDuplicateExemplar
        }

        // Since during the scrape the exemplars are sorted first by timestamp, then value, then labels,
        // an exemplar that sorts before the newest one is either a duplicate of a previous one
        // (but not the most recent one as that is checked above) or out of order.
        // Exemplars with duplicate timestamps are allowed as long as they have different values and/or labels
        // since that can happen for different buckets of a native histogram.
        // Duplicates and out of order are not distinguished, as iterating through the exemplars
        // to check for that would be expensive (versus just comparing with the most recent one) especially
        // since this is run under a lock, and not worth it as we just need to return an error so we do not
        // append the exemplar.
        outOfOrder := false
        switch {
        case e.Ts < newest.Ts:
                outOfOrder = true
        case e.Ts != newest.Ts:
                // Strictly newer timestamp: in order.
        case e.Value < newest.Value:
                outOfOrder = true
        case e.Value == newest.Value && e.Labels.Hash() < newest.Labels.Hash():
                outOfOrder = true
        }
        if !outOfOrder {
                return nil
        }

        if appended {
                ce.metrics.outOfOrderExemplars.Inc()
        }
        return storage.ErrOutOfOrderExemplar
}

// Resize changes the size of exemplar buffer by allocating a new buffer and migrating data to it.
// Exemplars are kept when possible. Shrinking will discard oldest data (in order of ingest) as needed.
// It returns the number of exemplars migrated into the new buffer, which is 0
// when the size is unchanged.
func (ce *CircularExemplarStorage) Resize(l int64) int {
        // Accept negative values as just 0 size.
        if l <= 0 {
                l = 0
        }

        // Take the lock before reading ce.exemplars, so the size comparison does
        // not race with a concurrent Resize or append.
        ce.lock.Lock()
        defer ce.lock.Unlock()

        if l == int64(len(ce.exemplars)) {
                return 0
        }

        oldBuffer := ce.exemplars
        oldNextIndex := int64(ce.nextIndex)

        ce.exemplars = make([]circularBufferEntry, l)
        ce.index = make(map[string]*indexEntry, l/estimatedExemplarsPerSeries)
        ce.nextIndex = 0

        // Replay as many entries as needed, starting with oldest first.
        count := int64(len(oldBuffer))
        if l < count {
                count = l
        }

        migrated := 0

        if l > 0 && len(oldBuffer) > 0 {
                // Rewind previous next index by count with wrap-around.
                // This math is essentially looking at nextIndex, where we would write the next exemplar to,
                // and find the index in the old exemplar buffer that we should start migrating exemplars from.
                // This way we don't migrate exemplars that would just be overwritten when migrating later exemplars.
                startIndex := (oldNextIndex - count + int64(len(oldBuffer))) % int64(len(oldBuffer))

                var buf [1024]byte
                for i := int64(0); i < count; i++ {
                        idx := (startIndex + i) % int64(len(oldBuffer))
                        // Slots that were never written have no index entry; skip them.
                        if oldBuffer[idx].ref != nil {
                                ce.migrate(&oldBuffer[idx], buf[:])
                                migrated++
                        }
                }
        }

        ce.computeMetrics()
        ce.metrics.maxExemplars.Set(float64(l))

        return migrated
}

// migrate is like AddExemplar but reuses existing structs. Expected to be called in batch and requires
// external lock and does not compute metrics.
// entry is copied into the slot at ce.nextIndex; buf is scratch space for
// rendering the series labels into the index key.
func (ce *CircularExemplarStorage) migrate(entry *circularBufferEntry, buf []byte) {
        seriesLabels := entry.ref.seriesLabels.Bytes(buf[:0])

        idx, ok := ce.index[string(seriesLabels)]
        if !ok {
                // First exemplar of this series in the new buffer: reuse the old
                // index entry and point its start at the slot being written.
                idx = entry.ref
                idx.oldest = ce.nextIndex
                ce.index[string(seriesLabels)] = idx
        } else {
                // Series already present: link the previously newest exemplar to this one.
                entry.ref = idx
                ce.exemplars[idx.newest].next = ce.nextIndex
        }
        idx.newest = ce.nextIndex

        // The migrated exemplar is now the newest of its series.
        entry.next = noExemplar
        ce.exemplars[ce.nextIndex] = *entry

        ce.nextIndex = (ce.nextIndex + 1) % len(ce.exemplars)
}

// AddExemplar stores e for the series with labels l in the slot at nextIndex,
// overwriting whatever exemplar was stored there.
// It returns storage.ErrExemplarsDisabled when the buffer has zero size and
// any validation error from validateExemplar, except that a duplicate of the
// series' newest exemplar is silently accepted as a no-op.
func (ce *CircularExemplarStorage) AddExemplar(l labels.Labels, e exemplar.Exemplar) error {
        // NOTE(review): this length check runs before the lock is taken, while
        // Resize replaces ce.exemplars under the lock — confirm this is acceptable.
        // validateExemplar repeats the check under the lock.
        if len(ce.exemplars) == 0 {
                return storage.ErrExemplarsDisabled
        }

        var buf [1024]byte
        seriesLabels := l.Bytes(buf[:])

        // TODO(bwplotka): This lock can lock all scrapers, there might high contention on this on scale.
        // Optimize by moving the lock to be per series (& benchmark it).
        ce.lock.Lock()
        defer ce.lock.Unlock()

        idx, ok := ce.index[string(seriesLabels)]
        err := ce.validateExemplar(idx, e, true)
        if err != nil {
                if errors.Is(err, storage.ErrDuplicateExemplar) {
                        // Duplicate exemplar, noop.
                        return nil
                }
                return err
        }

        if !ok {
                // First exemplar for this series: create its index entry.
                idx = &indexEntry{oldest: ce.nextIndex, seriesLabels: l}
                ce.index[string(seriesLabels)] = idx
        } else {
                // Link the series' previously newest exemplar to the slot being written.
                ce.exemplars[idx.newest].next = ce.nextIndex
        }

        if prev := &ce.exemplars[ce.nextIndex]; prev.ref != nil {
                // There exists an exemplar already on this ce.nextIndex entry,
                // drop it, to make place for others.
                if prev.next == noExemplar {
                        // Last item for this series, remove index entry.
                        var buf [1024]byte
                        prevLabels := prev.ref.seriesLabels.Bytes(buf[:])
                        delete(ce.index, string(prevLabels))
                } else {
                        // The evicted exemplar was the oldest of its series; advance the series' start.
                        prev.ref.oldest = prev.next
                }
        }

        // Default the next value to -1 (which we use to detect that we've iterated through all exemplars for a series in Select)
        // since this is now the newest exemplar stored for this series.
        ce.exemplars[ce.nextIndex].next = noExemplar
        ce.exemplars[ce.nextIndex].exemplar = e
        ce.exemplars[ce.nextIndex].ref = idx
        idx.newest = ce.nextIndex

        ce.nextIndex = (ce.nextIndex + 1) % len(ce.exemplars)

        ce.metrics.exemplarsAppended.Inc()
        ce.computeMetrics()
        return nil
}

// computeMetrics refreshes the series count, exemplar count and oldest
// timestamp gauges from the current buffer state. It takes no lock itself.
func (ce *CircularExemplarStorage) computeMetrics() {
        m := ce.metrics
        m.seriesWithExemplarsInStorage.Set(float64(len(ce.index)))

        size := len(ce.exemplars)
        if size == 0 {
                m.exemplarsInStorage.Set(float64(0))
                m.lastExemplarsTs.Set(float64(0))
                return
        }

        // A used slot at nextIndex means the buffer has wrapped around: it is full
        // and the next slot to be overwritten holds the oldest exemplar.
        if next := &ce.exemplars[ce.nextIndex]; next.ref != nil {
                m.exemplarsInStorage.Set(float64(size))
                m.lastExemplarsTs.Set(float64(next.exemplar.Ts) / 1000)
                return
        }

        // We did not yet fill the buffer.
        m.exemplarsInStorage.Set(float64(ce.nextIndex))
        if first := &ce.exemplars[0]; first.ref != nil {
                m.lastExemplarsTs.Set(float64(first.exemplar.Ts) / 1000)
        }
}

// IterateExemplars visits every stored exemplar from the oldest to the newest
// appended one and calls f with its series labels and the exemplar. Iteration
// stops at the first error returned by f, which is then returned to the
// caller. The read lock is held for the whole iteration.
func (ce *CircularExemplarStorage) IterateExemplars(f func(seriesLabels labels.Labels, e exemplar.Exemplar) error) error {
        ce.lock.RLock()
        defer ce.lock.RUnlock()

        size := len(ce.exemplars)
        // Start at nextIndex: that slot is the next to be overwritten.
        for offset := 0; offset < size; offset++ {
                entry := &ce.exemplars[(ce.nextIndex+offset)%size]
                if entry.ref == nil {
                        // Unused slot.
                        continue
                }
                if err := f(entry.ref.seriesLabels, entry.exemplar); err != nil {
                        return err
                }
        }
        return nil
}

// Copyright 2019 The Prometheus Authors
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package fileutil

import (
        "os"
        "path/filepath"
)

// DirSize walks dir recursively and returns the summed size in bytes of every
// entry that is not a directory. If the walk fails, the error is returned
// together with the size accumulated so far.
func DirSize(dir string) (int64, error) {
        var total int64
        walkErr := filepath.Walk(dir, func(_ string, fi os.FileInfo, err error) error {
                if err != nil {
                        return err
                }
                if fi.IsDir() {
                        return nil
                }
                total += fi.Size()
                return nil
        })
        return total, walkErr
}

// Copyright 2016 The etcd Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

//go:build !windows

package fileutil

import "os"

// OpenDir opens the directory at path so that it can be synced.
// On this platform it is a plain os.Open.
func OpenDir(path string) (*os.File, error) {
        return os.Open(path)
}

// Copyright 2018 The Prometheus Authors
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// Package fileutil provides utility methods used when dealing with the filesystem in tsdb.
// It is largely copied from github.com/coreos/etcd/pkg/fileutil to avoid the
// dependency chain it brings with it.
// Please check github.com/coreos/etcd for licensing information.
package fileutil

import (
        "os"
        "path/filepath"
        "strings"
)

// CopyDirs recursively copies the directory tree rooted at src into dest,
// creating dest if needed. Empty directories are copied as well.
// Source and destination must be full paths.
func CopyDirs(src, dest string) error {
        if err := os.MkdirAll(dest, 0o777); err != nil {
                return err
        }
        entries, err := readDirs(src)
        if err != nil {
                return err
        }

        for _, rel := range entries {
                from := filepath.Join(src, rel)
                to := filepath.Join(dest, rel)

                info, err := os.Stat(from)
                if err != nil {
                        return err
                }

                if !info.IsDir() {
                        if err := copyFile(from, to); err != nil {
                                return err
                        }
                        continue
                }

                // Directories are created explicitly so that empty ones are kept too.
                if err := os.MkdirAll(to, 0o777); err != nil {
                        return err
                }
        }
        return nil
}

// copyFile reads src fully into memory and writes it to dest, creating or
// truncating dest with mode 0o666 (before umask).
func copyFile(src, dest string) error {
        contents, err := os.ReadFile(src)
        if err != nil {
                return err
        }
        return os.WriteFile(dest, contents, 0o666)
}

// readDirs walks the source directory recursively and returns the path of
// every file and directory below it, with the src prefix removed (the paths
// keep their leading separator). src itself is not part of the result.
// Any error hit while walking, such as a missing or unreadable src, is returned.
func readDirs(src string) ([]string, error) {
        var files []string

        err := filepath.Walk(src, func(path string, f os.FileInfo, err error) error {
                // Propagate walk errors rather than silently returning a partial
                // (or empty) listing, which would make a failed copy look successful.
                if err != nil {
                        return err
                }
                relativePath := strings.TrimPrefix(path, src)
                if len(relativePath) > 0 {
                        files = append(files, relativePath)
                }
                return nil
        })
        if err != nil {
                return nil, err
        }
        return files, nil
}

// Rename renames from to to and then syncs the parent directory of to so the
// rename is persisted. The parent directory handle is always closed; a sync
// error takes precedence over a close error.
func Rename(from, to string) error {
        if err := os.Rename(from, to); err != nil {
                return err
        }

        // Sync the parent dir to persist the rename.
        parent, err := OpenDir(filepath.Dir(to))
        if err != nil {
                return err
        }

        syncErr := parent.Sync()
        closeErr := parent.Close()
        if syncErr != nil {
                return syncErr
        }
        return closeErr
}

// Replace moves a file or directory to a new location and deletes any previous data.
// It is not atomic.
func Replace(from, to string) error {
        // Only an existing destination directory has to be removed up front; an
        // existing destination file is left to os.Rename, which replaces it atomically.
        // Stat errors are deliberately ignored here.
        if info, err := os.Stat(to); err == nil && info.IsDir() {
                if err := os.RemoveAll(to); err != nil {
                        return err
                }
        }

        return Rename(from, to)
}

// Copyright 2016 The Prometheus Authors
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package fileutil

import (
        "os"
        "path/filepath"
)

// Releaser provides the Release method to release a file lock.
// It is the handle type returned by Flock.
type Releaser interface {
        // Release gives up the lock and reports any error encountered while doing so.
        Release() error
}

// Flock acquires a lock on the file called fileName, creating the file (and
// any missing parent directories) when it is absent. The lock is given up
// through the returned Releaser. existed reports whether the file was already
// present before locking. err is non-nil when locking failed. Neither Flock
// nor the returned Releaser is goroutine-safe.
func Flock(fileName string) (r Releaser, existed bool, err error) {
        parent := filepath.Dir(fileName)
        if mkErr := os.MkdirAll(parent, 0o755); mkErr != nil {
                return nil, false, mkErr
        }

        // Record existence before newLock gets the chance to create the file.
        if _, statErr := os.Stat(fileName); statErr == nil {
                existed = true
        }

        r, err = newLock(fileName)
        return r, existed, err
}

// Copyright 2016 The Prometheus Authors
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

//go:build darwin || dragonfly || freebsd || linux || netbsd || openbsd

package fileutil

import (
        "os"
        "syscall"
)

// unixLock is the Releaser returned by newLock on Unix-like systems. The lock
// is taken and dropped with syscall.Flock on the descriptor of f.
type unixLock struct {
        // f is the open lock file.
        f *os.File
}

// Release unlocks the file and closes its descriptor.
//
// The descriptor is closed even when the explicit unlock fails, so that a
// failed unlock does not leak the open file. When both steps fail, the unlock
// error is the one returned.
func (l *unixLock) Release() error {
        unlockErr := l.set(false)
        closeErr := l.f.Close()

        if unlockErr != nil {
                return unlockErr
        }
        return closeErr
}

// set takes an exclusive lock on l.f when lock is true and drops the lock
// otherwise. Both operations are issued with LOCK_NB, i.e. non-blocking.
func (l *unixLock) set(lock bool) error {
        op := syscall.LOCK_UN | syscall.LOCK_NB
        if lock {
                op = syscall.LOCK_EX | syscall.LOCK_NB
        }
        fd := int(l.f.Fd())
        return syscall.Flock(fd, op)
}

// newLock opens fileName read-write (creating it with mode 0o666 if needed)
// and takes an exclusive lock on it. When the lock cannot be taken the file
// is closed again and the locking error is returned.
func newLock(fileName string) (Releaser, error) {
        file, err := os.OpenFile(fileName, os.O_RDWR|os.O_CREATE, 0o666)
        if err != nil {
                return nil, err
        }

        lk := &unixLock{f: file}
        if lockErr := lk.set(true); lockErr != nil {
                file.Close()
                return nil, lockErr
        }
        return lk, nil
}

// Copyright 2018 The Prometheus Authors
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package fileutil

import (
        "fmt"
        "os"
)

// MmapFile pairs an open file with the byte slice obtained by memory-mapping
// it. Values are created by OpenMmapFile / OpenMmapFileWithSize and must be
// released with Close.
type MmapFile struct {
        // f is the open file backing the mapping.
        f *os.File
        // b is the slice returned by mmap for f.
        b []byte
}

// OpenMmapFile memory-maps the file at path, using the file's current size as
// the length of the mapping.
func OpenMmapFile(path string) (*MmapFile, error) {
        // A non-positive size makes OpenMmapFileWithSize stat the file itself.
        const useFileSize = 0
        return OpenMmapFileWithSize(path, useFileSize)
}

// OpenMmapFileWithSize opens the file at path read-only and memory-maps its
// first size bytes. When size is zero or negative, the size reported by Stat
// on the file is used instead.
//
// On failure the file is closed again and a wrapped error describing the
// failing step (open, stat or mmap) is returned.
func OpenMmapFileWithSize(path string, size int) (mf *MmapFile, retErr error) {
        f, err := os.Open(path)
        if err != nil {
                // No locking is involved here; report the step that actually failed.
                return nil, fmt.Errorf("open mmap file: %w", err)
        }
        // Do not leak the descriptor on any of the error returns below.
        defer func() {
                if retErr != nil {
                        f.Close()
                }
        }()
        if size <= 0 {
                info, err := f.Stat()
                if err != nil {
                        return nil, fmt.Errorf("stat: %w", err)
                }
                size = int(info.Size())
        }

        b, err := mmap(f, size)
        if err != nil {
                return nil, fmt.Errorf("mmap, size %d: %w", size, err)
        }

        return &MmapFile{f: f, b: b}, nil
}

// Close unmaps the mapped bytes and closes the underlying file. Both steps
// are always attempted; an unmap error takes precedence over a close error.
func (f *MmapFile) Close() error {
        unmapErr := munmap(f.b)
        closeErr := f.f.Close()

        if unmapErr == nil {
                return closeErr
        }
        return unmapErr
}

// File returns the open file that backs the mapping.
func (f *MmapFile) File() *os.File {
        return f.f
}

// Bytes returns the memory-mapped contents. The slice is the mapping itself,
// not a copy; Close passes it to munmap.
func (f *MmapFile) Bytes() []byte {
        return f.b
}

// Copyright 2017 The Prometheus Authors
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

//go:build !windows && !plan9 && !js

package fileutil

import (
        "os"

        "golang.org/x/sys/unix"
)

// mmap maps length bytes of f, starting at offset zero, as a read-only
// (PROT_READ) shared (MAP_SHARED) mapping.
func mmap(f *os.File, length int) ([]byte, error) {
        const offset = 0
        fd := int(f.Fd())
        return unix.Mmap(fd, offset, length, unix.PROT_READ, unix.MAP_SHARED)
}

// munmap releases a mapping previously returned by mmap by handing b to
// unix.Munmap.
func munmap(b []byte) (err error) {
        return unix.Munmap(b)
}

// Copyright 2015 The etcd Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package fileutil

import (
        "io"
        "os"
)

// Preallocate tries to reserve sizeInBytes of space for f. Preallocation is
// only supported on linux, and only by some filesystems (btrfs, ext4, etc.).
// An unsupported operation is not reported as an error; any other error
// encountered is returned.
//
// extendFile selects between extending the file (preallocExtend) and
// allocating without extending it (preallocFixed).
func Preallocate(f *os.File, sizeInBytes int64, extendFile bool) error {
        switch {
        case sizeInBytes == 0:
                // fallocate rejects a zero length with EINVAL, so there is nothing to do.
                return nil
        case extendFile:
                return preallocExtend(f, sizeInBytes)
        default:
                return preallocFixed(f, sizeInBytes)
        }
}

// preallocExtendTrunc grows f to sizeInBytes with Truncate, for use when
// fallocate is unavailable.
//
// Like fallocate in its default mode, it only ever extends: a file that is
// already at least sizeInBytes long is left unchanged rather than being cut
// down. The current file offset is preserved.
func preallocExtendTrunc(f *os.File, sizeInBytes int64) error {
        curOff, err := f.Seek(0, io.SeekCurrent)
        if err != nil {
                return err
        }
        // Seeking to the end with a zero delta yields the current file size.
        size, err := f.Seek(0, io.SeekEnd)
        if err != nil {
                return err
        }
        // Restore the caller's offset before deciding what to do.
        if _, err = f.Seek(curOff, io.SeekStart); err != nil {
                return err
        }
        if size >= sizeInBytes {
                // Already large enough; truncating here would discard data.
                return nil
        }
        return f.Truncate(sizeInBytes)
}

// Copyright 2015 The etcd Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package fileutil

import (
        "errors"
        "os"
        "syscall"
)

// preallocExtend allocates sizeInBytes for f with fallocate in mode 0, which
// also changes the file size. If fallocate reports ENOTSUP (not supported) or
// EINTR (seen frequently in some environments), it falls back to
// preallocExtendTrunc. Any other error is returned unchanged.
func preallocExtend(f *os.File, sizeInBytes int64) error {
        err := syscall.Fallocate(int(f.Fd()), 0, 0, sizeInBytes)
        if err == nil {
                return nil
        }

        var errno syscall.Errno
        if !errors.As(err, &errno) {
                return err
        }
        switch errno {
        case syscall.ENOTSUP, syscall.EINTR:
                return preallocExtendTrunc(f, sizeInBytes)
        }
        return err
}

// preallocFixed allocates sizeInBytes for f with fallocate while keeping the
// file size as it is. ENOTSUP from fallocate is treated as success; every
// other error is returned unchanged.
func preallocFixed(f *os.File, sizeInBytes int64) error {
        // Mode 1 keeps the file size; see FALLOC_FL_KEEP_SIZE.
        const keepSize = 1

        err := syscall.Fallocate(int(f.Fd()), keepSize, 0, sizeInBytes)

        var errno syscall.Errno
        if errors.As(err, &errno) && errno == syscall.ENOTSUP {
                return nil
        }
        return err
}

// Copyright 2016 The etcd Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

//go:build linux

package fileutil

import (
        "os"
        "syscall"
)

// Fdatasync flushes f like fsync(), except that modified metadata is only
// flushed when it is needed for a subsequent data retrieval to be handled
// correctly.
func Fdatasync(f *os.File) error {
        fd := int(f.Fd())
        return syscall.Fdatasync(fd)
}

// Copyright 2017 The Prometheus Authors
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package tsdb

import (
        "context"
        "errors"
        "fmt"
        "io"
        "math"
        "path/filepath"
        "runtime"
        "strconv"
        "sync"
        "time"

        "github.com/go-kit/log"
        "github.com/go-kit/log/level"
        "github.com/oklog/ulid"
        "go.uber.org/atomic"

        "github.com/prometheus/client_golang/prometheus"

        "github.com/prometheus/prometheus/config"
        "github.com/prometheus/prometheus/model/exemplar"
        "github.com/prometheus/prometheus/model/histogram"
        "github.com/prometheus/prometheus/model/labels"
        "github.com/prometheus/prometheus/model/metadata"
        "github.com/prometheus/prometheus/storage"
        "github.com/prometheus/prometheus/tsdb/chunkenc"
        "github.com/prometheus/prometheus/tsdb/chunks"
        tsdb_errors "github.com/prometheus/prometheus/tsdb/errors"
        "github.com/prometheus/prometheus/tsdb/index"
        "github.com/prometheus/prometheus/tsdb/record"
        "github.com/prometheus/prometheus/tsdb/tombstones"
        "github.com/prometheus/prometheus/tsdb/wlog"
        "github.com/prometheus/prometheus/util/zeropool"
)

var (
        // ErrInvalidSample is returned if an appended sample is not valid and can't
        // be ingested.
        ErrInvalidSample = errors.New("invalid sample")
        // ErrInvalidExemplar is returned if an appended exemplar is not valid and can't
        // be ingested.
        ErrInvalidExemplar = errors.New("invalid exemplar")
        // ErrAppenderClosed is returned if an appender has already been successfully
        // rolled back or committed.
        ErrAppenderClosed = errors.New("appender closed")

        // defaultIsolationDisabled is true if isolation is disabled by default.
        defaultIsolationDisabled = false

        // defaultWALReplayConcurrency is the GOMAXPROCS setting read at package
        // initialization. NewHead falls back to it when
        // HeadOptions.WALReplayConcurrency is zero or negative.
        defaultWALReplayConcurrency = runtime.GOMAXPROCS(0)
)

// Head handles reads and writes of time series data within a time window.
//
// A Head is created by NewHead, which stores the registerer, logger, WAL/WBL
// handles, options and stats it is given. resetInMemoryState (re)initialises
// the purely in-memory parts: series, postings, tombstones, deleted,
// isolation state, exemplar storage and the min/max time trackers.
type Head struct {
        chunkRange               atomic.Int64
        numSeries                atomic.Uint64
        minOOOTime, maxOOOTime   atomic.Int64 // TODO(jesusvazquez) These should be updated after garbage collection.
        minTime, maxTime         atomic.Int64 // Current min and max of the samples included in the head. TODO(jesusvazquez) Ensure these are properly tracked.
        minValidTime             atomic.Int64 // Mint allowed to be added to the head. It shouldn't be lower than the maxt of the last persisted block.
        lastWALTruncationTime    atomic.Int64
        lastMemoryTruncationTime atomic.Int64
        lastSeriesID             atomic.Uint64
        // All the ooo m-map chunks should be after this. This is used to truncate old ooo m-map chunks.
        // This should be converted to chunks.ChunkDiskMapperRef after loading.
        minOOOMmapRef atomic.Uint64

        // metrics is set at the end of NewHead, after the in-memory state has
        // been initialised.
        metrics             *headMetrics
        opts                *HeadOptions
        wal, wbl            *wlog.WL
        exemplarMetrics     *ExemplarMetrics
        exemplars           ExemplarStorage
        logger              log.Logger
        appendPool          zeropool.Pool[[]record.RefSample]
        exemplarsPool       zeropool.Pool[[]exemplarWithSeriesRef]
        histogramsPool      zeropool.Pool[[]record.RefHistogramSample]
        floatHistogramsPool zeropool.Pool[[]record.RefFloatHistogramSample]
        metadataPool        zeropool.Pool[[]record.RefMetadata]
        seriesPool          zeropool.Pool[[]*memSeries]
        bytesPool           zeropool.Pool[[]byte]
        memChunkPool        sync.Pool

        // All series addressable by their ID or hash.
        series *stripeSeries

        deletedMtx sync.Mutex
        deleted    map[chunks.HeadSeriesRef]int // Deleted series, and what WAL segment they must be kept until.

        // TODO(codesome): Extend MemPostings to return only OOOPostings, Set OOOStatus, ... Like an additional map of ooo postings.
        postings *index.MemPostings // Postings lists for terms.

        tombstones *tombstones.MemTombstones

        iso *isolation

        oooIso *oooIsolation

        cardinalityMutex      sync.Mutex
        cardinalityCache      *index.PostingsStats // Posting stats cache which will expire after 30sec.
        cardinalityCacheKey   string
        lastPostingsStatsCall time.Duration // Last posting stats call (PostingsCardinalityStats()) time for caching.

        // chunkDiskMapper is used to write and read Head chunks to/from disk.
        chunkDiskMapper *chunks.ChunkDiskMapper

        chunkSnapshotMtx sync.Mutex

        // NOTE(review): closedMtx presumably guards closed — confirm against the
        // methods that read and write it.
        closedMtx sync.Mutex
        closed    bool

        stats *HeadStats
        reg   prometheus.Registerer

        writeNotified wlog.WriteNotified

        memTruncationInProcess atomic.Bool
}

// ExemplarStorage is the interface of the exemplar store held by Head (see
// Head.exemplars). It embeds storage.ExemplarQueryable and adds methods for
// adding, validating and iterating over exemplars.
type ExemplarStorage interface {
        storage.ExemplarQueryable
        AddExemplar(labels.Labels, exemplar.Exemplar) error
        ValidateExemplar(labels.Labels, exemplar.Exemplar) error
        // IterateExemplars calls f with series labels and an exemplar.
        // NOTE(review): presumably once per stored exemplar, stopping at the
        // first error returned by f — confirm against the implementations.
        IterateExemplars(f func(seriesLabels labels.Labels, e exemplar.Exemplar) error) error
}

// HeadOptions are parameters for the Head block.
//
// NewHead normalises some fields in place on the options it is given: a nil
// SeriesCallback is replaced by a no-op callback, a nil ChunkPool by
// chunkenc.NewPool(), and the adjustments noted on the individual fields
// below are applied.
type HeadOptions struct {
        // Runtime reloadable option. At the top of the struct for 32 bit OS:
        // https://pkg.go.dev/sync/atomic#pkg-note-BUG
        // NewHead stores 0 here when EnableExemplarStorage is false.
        MaxExemplars atomic.Int64

        // OutOfOrderTimeWindow: negative values are reset to 0 by NewHead.
        // OutOfOrderCapMax: NewHead returns an error unless the value is > 0
        // and <= 255.
        OutOfOrderTimeWindow atomic.Int64
        OutOfOrderCapMax     atomic.Int64

        // EnableNativeHistograms enables the ingestion of native histograms.
        EnableNativeHistograms atomic.Bool

        // EnableCreatedTimestampZeroIngestion enables the ingestion of the created timestamp as a synthetic zero sample.
        // See: https://github.com/prometheus/proposals/blob/main/proposals/2023-06-13_created-timestamp.md
        EnableCreatedTimestampZeroIngestion bool

        // ChunkRange must be at least 1; NewHead returns an error otherwise.
        ChunkRange int64
        // ChunkDirRoot is the parent directory of the chunks directory.
        ChunkDirRoot         string
        ChunkPool            chunkenc.Pool
        ChunkWriteBufferSize int
        ChunkWriteQueueSize  int

        SamplesPerChunk int

        // StripeSize sets the number of entries in the hash map, it must be a power of 2.
        // A larger StripeSize will allocate more memory up-front, but will increase performance when handling a large number of series.
        // A smaller StripeSize reduces the memory allocated, but can decrease performance with large number of series.
        StripeSize                     int
        SeriesCallback                 SeriesLifecycleCallback
        EnableExemplarStorage          bool
        EnableMemorySnapshotOnShutdown bool

        IsolationDisabled bool

        // Maximum number of CPUs that can simultaneously process WAL replay.
        // The default value is GOMAXPROCS.
        // If it is set to a negative value or zero, the default value is used.
        WALReplayConcurrency int

        // EnableSharding enables ShardedPostings() support in the Head.
        EnableSharding bool
}

const (
        // DefaultOutOfOrderCapMax is the default maximum size of an in-memory out-of-order chunk.
        // It is the value DefaultHeadOptions stores in HeadOptions.OutOfOrderCapMax;
        // NewHead only accepts values that are > 0 and <= 255.
        DefaultOutOfOrderCapMax int64 = 32
        // DefaultSamplesPerChunk provides a default target number of samples per chunk.
        // It is the value DefaultHeadOptions assigns to HeadOptions.SamplesPerChunk.
        DefaultSamplesPerChunk = 120
)

// DefaultHeadOptions returns a freshly allocated HeadOptions populated with
// the package defaults. Fields not assigned here keep their zero value.
func DefaultHeadOptions() *HeadOptions {
        opts := new(HeadOptions)

        // Chunk layout and on-disk chunk handling.
        opts.ChunkRange = DefaultBlockDuration
        opts.ChunkDirRoot = ""
        opts.ChunkPool = chunkenc.NewPool()
        opts.ChunkWriteBufferSize = chunks.DefaultWriteBufferSize
        opts.ChunkWriteQueueSize = chunks.DefaultWriteQueueSize
        opts.SamplesPerChunk = DefaultSamplesPerChunk

        // Series bookkeeping.
        opts.StripeSize = DefaultStripeSize
        opts.SeriesCallback = &noopSeriesLifecycleCallback{}

        opts.IsolationDisabled = defaultIsolationDisabled
        opts.WALReplayConcurrency = defaultWALReplayConcurrency

        // The cap is an atomic field and therefore set through Store.
        opts.OutOfOrderCapMax.Store(DefaultOutOfOrderCapMax)
        return opts
}

// SeriesLifecycleCallback specifies a list of callbacks that will be called during the lifecycle of a series.
// It is always a no-op in Prometheus and mainly meant for external users who import TSDB.
// All the callbacks should be safe to be called concurrently.
// It is up to the user to implement soft or hard consistency by making the callbacks
// atomic or non-atomic. Atomic callbacks can cause performance degradation.
type SeriesLifecycleCallback interface {
        // PreCreation is called before creating a series to indicate if the series can be created.
        // A non nil error means the series should not be created.
        PreCreation(labels.Labels) error
        // PostCreation is called after creating a series to indicate a creation of series.
        PostCreation(labels.Labels)
        // PostDeletion is called after deletion of series.
        // The map holds the labels of the deleted series keyed by their series reference.
        PostDeletion(map[chunks.HeadSeriesRef]labels.Labels)
}

// NewHead creates a Head that uses the given WAL and WBL handles, options and
// stats. Metrics are registered with r, and chunks are written below
// opts.ChunkDirRoot.
//
// A nil logger is replaced by a no-op logger and nil stats by NewHeadStats().
// opts is normalised in place: a negative out-of-order time window becomes 0,
// MaxExemplars is zeroed when exemplar storage is disabled, and a nil
// SeriesCallback, a nil ChunkPool and a non-positive WALReplayConcurrency are
// replaced by their defaults. An error is returned for an out-of-range
// OutOfOrderCapMax, a ChunkRange below 1, or when the in-memory state or the
// chunk disk mapper cannot be set up.
func NewHead(r prometheus.Registerer, l log.Logger, wal, wbl *wlog.WL, opts *HeadOptions, stats *HeadStats) (*Head, error) {
        if l == nil {
                l = log.NewNopLogger()
        }

        // Negative windows are clamped to zero.
        if window := opts.OutOfOrderTimeWindow.Load(); window < 0 {
                opts.OutOfOrderTimeWindow.Store(0)
        }

        // The time window can be changed at runtime, so the cap has to be
        // valid even while out-of-order ingestion is not enabled yet.
        if capMax := opts.OutOfOrderCapMax.Load(); capMax <= 0 || capMax > 255 {
                return nil, fmt.Errorf("OOOCapMax of %d is invalid. must be > 0 and <= 255", capMax)
        }
        if opts.ChunkRange < 1 {
                return nil, fmt.Errorf("invalid chunk range %d", opts.ChunkRange)
        }

        if opts.SeriesCallback == nil {
                opts.SeriesCallback = &noopSeriesLifecycleCallback{}
        }
        if stats == nil {
                stats = NewHeadStats()
        }
        if !opts.EnableExemplarStorage {
                opts.MaxExemplars.Store(0)
        }

        newMemChunk := func() any {
                return &memChunk{}
        }
        h := &Head{
                reg:          r,
                logger:       l,
                wal:          wal,
                wbl:          wbl,
                opts:         opts,
                stats:        stats,
                memChunkPool: sync.Pool{New: newMemChunk},
        }
        if err := h.resetInMemoryState(); err != nil {
                return nil, err
        }

        if opts.ChunkPool == nil {
                opts.ChunkPool = chunkenc.NewPool()
        }
        if opts.WALReplayConcurrency <= 0 {
                opts.WALReplayConcurrency = defaultWALReplayConcurrency
        }

        mapper, err := chunks.NewChunkDiskMapper(
                r,
                mmappedChunksDir(opts.ChunkDirRoot),
                opts.ChunkPool,
                opts.ChunkWriteBufferSize,
                opts.ChunkWriteQueueSize,
        )
        if err != nil {
                return nil, err
        }
        h.chunkDiskMapper = mapper

        // Metrics are created last, once the rest of the Head is in place.
        h.metrics = newHeadMetrics(h, r)

        return h, nil
}

// resetInMemoryState (re)initialises everything the Head keeps purely in
// memory: series, postings, tombstones, deleted series, isolation state,
// exemplar storage and the time trackers.
//
// When the Head already holds series, they are all flushed through
// iterForDeletion first, and the number it returns is added to the
// seriesRemoved counter. The metrics of an existing circular exemplar storage
// are carried over to the new one instead of being created again.
func (h *Head) resetInMemoryState() error {
        // A comma-ok assertion on a nil interface simply yields ok == false.
        var exMetrics *ExemplarMetrics
        if prev, ok := h.exemplars.(*CircularExemplarStorage); ok {
                exMetrics = prev.metrics
        }
        if exMetrics == nil {
                exMetrics = NewExemplarMetrics(h.reg)
        }
        exStorage, err := NewCircularExemplarStorage(h.opts.MaxExemplars.Load(), exMetrics)
        if err != nil {
                return err
        }

        if h.series != nil {
                // Walk the existing series so that the appropriate hooks are
                // called and the removed-series metric is incremented.
                flushAll := func(_ int, _ uint64, s *memSeries, flushedForCallback map[chunks.HeadSeriesRef]labels.Labels) {
                        // Every series is flushed.
                        flushedForCallback[s.ref] = s.lset
                }
                removed := h.series.iterForDeletion(flushAll)
                h.metrics.seriesRemoved.Add(float64(removed))
        }

        // Fresh containers.
        h.series = newStripeSeries(h.opts.StripeSize, h.opts.SeriesCallback)
        h.iso = newIsolation(h.opts.IsolationDisabled)
        h.oooIso = newOOOIsolation()
        h.numSeries.Store(0)
        h.exemplarMetrics = exMetrics
        h.exemplars = exStorage
        h.postings = index.NewUnorderedMemPostings()
        h.tombstones = tombstones.NewMemTombstones()
        h.deleted = map[chunks.HeadSeriesRef]int{}
        h.chunkRange.Store(h.opts.ChunkRange)

        // Minimum trackers start at the largest value and maximum trackers
        // at the smallest one.
        h.minTime.Store(math.MaxInt64)
        h.maxTime.Store(math.MinInt64)
        h.minOOOTime.Store(math.MaxInt64)
        h.maxOOOTime.Store(math.MinInt64)
        h.lastWALTruncationTime.Store(math.MinInt64)
        h.lastMemoryTruncationTime.Store(math.MinInt64)
        return nil
}

// headMetrics groups the Prometheus collectors of a Head. They are created,
// and registered when a registerer is given, by newHeadMetrics. The
// *prometheus.CounterVec fields are created with a single "type" label.
type headMetrics struct {
        activeAppenders           prometheus.Gauge
        series                    prometheus.GaugeFunc
        seriesCreated             prometheus.Counter
        seriesRemoved             prometheus.Counter
        seriesNotFound            prometheus.Counter
        chunks                    prometheus.Gauge
        chunksCreated             prometheus.Counter
        chunksRemoved             prometheus.Counter
        gcDuration                prometheus.Summary
        samplesAppended           *prometheus.CounterVec
        outOfOrderSamplesAppended *prometheus.CounterVec
        outOfBoundSamples         *prometheus.CounterVec
        outOfOrderSamples         *prometheus.CounterVec
        tooOldSamples             *prometheus.CounterVec
        walTruncateDuration       prometheus.Summary
        walCorruptionsTotal       prometheus.Counter
        dataTotalReplayDuration   prometheus.Gauge
        headTruncateFail          prometheus.Counter
        headTruncateTotal         prometheus.Counter
        checkpointDeleteFail      prometheus.Counter
        checkpointDeleteTotal     prometheus.Counter
        checkpointCreationFail    prometheus.Counter
        checkpointCreationTotal   prometheus.Counter
        mmapChunkCorruptionTotal  prometheus.Counter
        snapshotReplayErrorTotal  prometheus.Counter // Will be either 0 or 1.
        oooHistogram              prometheus.Histogram
        mmapChunksTotal           prometheus.Counter
}

// Sample type names.
// NOTE(review): presumably used as values of the "type" label on the
// per-sample-type counters in headMetrics — confirm at the call sites.
const (
        sampleMetricTypeFloat     = "float"
        sampleMetricTypeHistogram = "histogram"
)

func newHeadMetrics(h *Head, r prometheus.Registerer) *headMetrics {
        m := &headMetrics{
                activeAppenders: prometheus.NewGauge(prometheus.GaugeOpts{
                        Name: "prometheus_tsdb_head_active_appenders",
                        Help: "Number of currently active appender transactions",
                }),
                series: prometheus.NewGaugeFunc(prometheus.GaugeOpts{
                        Name: "prometheus_tsdb_head_series",
                        Help: "Total number of series in the head block.",
                }, func() float64 {
                        return float64(h.NumSeries())
                }),
                seriesCreated: prometheus.NewCounter(prometheus.CounterOpts{
                        Name: "prometheus_tsdb_head_series_created_total",
                        Help: "Total number of series created in the head",
                }),
                seriesRemoved: prometheus.NewCounter(prometheus.CounterOpts{
                        Name: "prometheus_tsdb_head_series_removed_total",
                        Help: "Total number of series removed in the head",
                }),
                seriesNotFound: prometheus.NewCounter(prometheus.CounterOpts{
                        Name: "prometheus_tsdb_head_series_not_found_total",
                        Help: "Total number of requests for series that were not found.",
                }),
                chunks: prometheus.NewGauge(prometheus.GaugeOpts{
                        Name: "prometheus_tsdb_head_chunks",
                        Help: "Total number of chunks in the head block.",
                }),
                chunksCreated: prometheus.NewCounter(prometheus.CounterOpts{
                        Name: "prometheus_tsdb_head_chunks_created_total",
                        Help: "Total number of chunks created in the head",
                }),
                chunksRemoved: prometheus.NewCounter(prometheus.CounterOpts{
                        Name: "prometheus_tsdb_head_chunks_removed_total",
                        Help: "Total number of chunks removed in the head",
                }),
                gcDuration: prometheus.NewSummary(prometheus.SummaryOpts{
                        Name: "prometheus_tsdb_head_gc_duration_seconds",
                        Help: "Runtime of garbage collection in the head block.",
                }),
                walTruncateDuration: prometheus.NewSummary(prometheus.SummaryOpts{
                        Name: "prometheus_tsdb_wal_truncate_duration_seconds",
                        Help: "Duration of WAL truncation.",
                }),
                walCorruptionsTotal: prometheus.NewCounter(prometheus.CounterOpts{
                        Name: "prometheus_tsdb_wal_corruptions_total",
                        Help: "Total number of WAL corruptions.",
                }),
                dataTotalReplayDuration: prometheus.NewGauge(prometheus.GaugeOpts{
                        Name: "prometheus_tsdb_data_replay_duration_seconds",
                        Help: "Time taken to replay the data on disk.",
                }),
                samplesAppended: prometheus.NewCounterVec(prometheus.CounterOpts{
                        Name: "prometheus_tsdb_head_samples_appended_total",
                        Help: "Total number of appended samples.",
                }, []string{"type"}),
                outOfOrderSamplesAppended: prometheus.NewCounterVec(prometheus.CounterOpts{
                        Name: "prometheus_tsdb_head_out_of_order_samples_appended_total",
                        Help: "Total number of appended out of order samples.",
                }, []string{"type"}),
                outOfBoundSamples: prometheus.NewCounterVec(prometheus.CounterOpts{
                        Name: "prometheus_tsdb_out_of_bound_samples_total",
                        Help: "Total number of out of bound samples ingestion failed attempts with out of order support disabled.",
                }, []string{"type"}),
                outOfOrderSamples: prometheus.NewCounterVec(prometheus.CounterOpts{
                        Name: "prometheus_tsdb_out_of_order_samples_total",
                        Help: "Total number of out of order samples ingestion failed attempts due to out of order being disabled.",
                }, []string{"type"}),
                tooOldSamples: prometheus.NewCounterVec(prometheus.CounterOpts{
                        Name: "prometheus_tsdb_too_old_samples_total",
                        Help: "Total number of out of order samples ingestion failed attempts with out of support enabled, but sample outside of time window.",
                }, []string{"type"}),
                headTruncateFail: prometheus.NewCounter(prometheus.CounterOpts{
                        Name: "prometheus_tsdb_head_truncations_failed_total",
                        Help: "Total number of head truncations that failed.",
                }),
                headTruncateTotal: prometheus.NewCounter(prometheus.CounterOpts{
                        Name: "prometheus_tsdb_head_truncations_total",
                        Help: "Total number of head truncations attempted.",
                }),
                checkpointDeleteFail: prometheus.NewCounter(prometheus.CounterOpts{
                        Name: "prometheus_tsdb_checkpoint_deletions_failed_total",
                        Help: "Total number of checkpoint deletions that failed.",
                }),
                checkpointDeleteTotal: prometheus.NewCounter(prometheus.CounterOpts{
                        Name: "prometheus_tsdb_checkpoint_deletions_total",
                        Help: "Total number of checkpoint deletions attempted.",
                }),
                checkpointCreationFail: prometheus.NewCounter(prometheus.CounterOpts{
                        Name: "prometheus_tsdb_checkpoint_creations_failed_total",
                        Help: "Total number of checkpoint creations that failed.",
                }),
                checkpointCreationTotal: prometheus.NewCounter(prometheus.CounterOpts{
                        Name: "prometheus_tsdb_checkpoint_creations_total",
                        Help: "Total number of checkpoint creations attempted.",
                }),
                mmapChunkCorruptionTotal: prometheus.NewCounter(prometheus.CounterOpts{
                        Name: "prometheus_tsdb_mmap_chunk_corruptions_total",
                        Help: "Total number of memory-mapped chunk corruptions.",
                }),
                snapshotReplayErrorTotal: prometheus.NewCounter(prometheus.CounterOpts{
                        Name: "prometheus_tsdb_snapshot_replay_error_total",
                        Help: "Total number snapshot replays that failed.",
                }),
                oooHistogram: prometheus.NewHistogram(prometheus.HistogramOpts{
                        Name: "prometheus_tsdb_sample_ooo_delta",
                        Help: "Delta in seconds by which a sample is considered out of order (reported regardless of OOO time window and whether sample is accepted or not).",
                        Buckets: []float64{
                                60 * 10,      // 10 min
                                60 * 30,      // 30 min
                                60 * 60,      // 60 min
                                60 * 60 * 2,  // 2h
                                60 * 60 * 3,  // 3h
                                60 * 60 * 6,  // 6h
                                60 * 60 * 12, // 12h
                        },
                        NativeHistogramBucketFactor:     1.1,
                        NativeHistogramMaxBucketNumber:  100,
                        NativeHistogramMinResetDuration: 1 * time.Hour,
                }),
                mmapChunksTotal: prometheus.NewCounter(prometheus.CounterOpts{
                        Name: "prometheus_tsdb_mmap_chunks_total",
                        Help: "Total number of chunks that were memory-mapped.",
                }),
        }

        if r != nil {
                r.MustRegister(
                        m.activeAppenders,
                        m.series,
                        m.chunks,
                        m.chunksCreated,
                        m.chunksRemoved,
                        m.seriesCreated,
                        m.seriesRemoved,
                        m.seriesNotFound,
                        m.gcDuration,
                        m.walTruncateDuration,
                        m.walCorruptionsTotal,
                        m.dataTotalReplayDuration,
                        m.samplesAppended,
                        m.outOfOrderSamplesAppended,
                        m.outOfBoundSamples,
                        m.outOfOrderSamples,
                        m.tooOldSamples,
                        m.headTruncateFail,
                        m.headTruncateTotal,
                        m.checkpointDeleteFail,
                        m.checkpointDeleteTotal,
                        m.checkpointCreationFail,
                        m.checkpointCreationTotal,
                        m.mmapChunksTotal,
                        m.mmapChunkCorruptionTotal,
                        m.snapshotReplayErrorTotal,
                        // Metrics bound to functions and not needed in tests
                        // can be created and registered on the spot.
                        prometheus.NewGaugeFunc(prometheus.GaugeOpts{
                                Name: "prometheus_tsdb_head_max_time",
                                Help: "Maximum timestamp of the head block. The unit is decided by the library consumer.",
                        }, func() float64 {
                                return float64(h.MaxTime())
                        }),
                        prometheus.NewGaugeFunc(prometheus.GaugeOpts{
                                Name: "prometheus_tsdb_head_min_time",
                                Help: "Minimum time bound of the head block. The unit is decided by the library consumer.",
                        }, func() float64 {
                                return float64(h.MinTime())
                        }),
                        prometheus.NewGaugeFunc(prometheus.GaugeOpts{
                                Name: "prometheus_tsdb_isolation_low_watermark",
                                Help: "The lowest TSDB append ID that is still referenced.",
                        }, func() float64 {
                                return float64(h.iso.lowWatermark())
                        }),
                        prometheus.NewGaugeFunc(prometheus.GaugeOpts{
                                Name: "prometheus_tsdb_isolation_high_watermark",
                                Help: "The highest TSDB append ID that has been given out.",
                        }, func() float64 {
                                return float64(h.iso.lastAppendID())
                        }),
                        prometheus.NewGaugeFunc(prometheus.GaugeOpts{
                                Name: "prometheus_tsdb_head_chunks_storage_size_bytes",
                                Help: "Size of the chunks_head directory.",
                        }, func() float64 {
                                val, err := h.chunkDiskMapper.Size()
                                if err != nil {
                                        level.Error(h.logger).Log("msg", "Failed to calculate size of \"chunks_head\" dir",
                                                "err", err.Error())
                                }
                                return float64(val)
                        }),
                )
        }
        return m
}

// mmappedChunksDir returns the path of the "chunks_head" directory located
// directly under dir.
func mmappedChunksDir(dir string) string {
        const headChunksDirName = "chunks_head"
        return filepath.Join(dir, headChunksDirName)
}

// HeadStats are the statistics for the head component of the DB.
type HeadStats struct {
        // WALReplayStatus holds the progress information about the WAL replay.
        WALReplayStatus *WALReplayStatus
}

// NewHeadStats allocates a HeadStats whose WALReplayStatus field points at a
// fresh, zero-valued WALReplayStatus.
func NewHeadStats() *HeadStats {
        stats := new(HeadStats)
        stats.WALReplayStatus = new(WALReplayStatus)
        return stats
}

// WALReplayStatus contains status information about the WAL replay.
// The embedded RWMutex guards the three counters below.
type WALReplayStatus struct {
        sync.RWMutex
        // NOTE(review): Min/Max/Current are presumably segment indexes (first,
        // last, and most recently replayed) — confirm against the code that
        // updates them.
        Min     int
        Max     int
        Current int
}

// GetWALReplayStatus returns a copy of the replay counters taken under the
// read lock. The returned value carries its own zero-valued mutex and is
// independent from the receiver.
func (s *WALReplayStatus) GetWALReplayStatus() WALReplayStatus {
        s.RLock()
        lo, hi, cur := s.Min, s.Max, s.Current
        s.RUnlock()

        return WALReplayStatus{Min: lo, Max: hi, Current: cur}
}

// cardinalityCacheExpirationTime is how long a cached postings cardinality
// result is served before it is recomputed.
const cardinalityCacheExpirationTime = 30 * time.Second

// Init loads data from the write ahead log and prepares the head for writes.
// It should be called before using an appender so that it
// limits the ingested samples to the head min valid time.
//
// Replay happens in this order:
//  1. the chunk snapshot, when opts.EnableMemorySnapshotOnShutdown is set;
//  2. the on-disk m-mapped chunks, when a snapshot was loaded or a WAL exists;
//  3. the last WAL checkpoint, if one is found and it is not older than the snapshot;
//  4. the remaining WAL segments;
//  5. the WBL segments, when a WBL is configured.
//
// If no WAL is configured, Init returns nil right after step 2. Errors hit
// while replaying the WBL are returned wrapped in *errLoadWbl.
func (h *Head) Init(minValidTime int64) error {
        h.minValidTime.Store(minValidTime)
        // Deferred calls run in reverse registration order on every return path:
        // the min-time clamp below runs first, then gc, then EnsureOrder.
        defer func() {
                h.postings.EnsureOrder(h.opts.WALReplayConcurrency)
        }()
        defer h.gc() // After loading the wal remove the obsolete data from the head.
        defer func() {
                // Loading of m-mapped chunks and snapshot can make the mint of the Head
                // to go below minValidTime.
                if h.MinTime() < h.minValidTime.Load() {
                        h.minTime.Store(h.minValidTime.Load())
                }
        }()

        level.Info(h.logger).Log("msg", "Replaying on-disk memory mappable chunks if any")
        start := time.Now()

        // The -1/0 pair is the "no snapshot" state; it is restored below whenever
        // snapshot data has to be discarded.
        snapIdx, snapOffset := -1, 0
        refSeries := make(map[chunks.HeadSeriesRef]*memSeries)

        snapshotLoaded := false
        var chunkSnapshotLoadDuration time.Duration
        if h.opts.EnableMemorySnapshotOnShutdown {
                level.Info(h.logger).Log("msg", "Chunk snapshot is enabled, replaying from the snapshot")
                // If there are any WAL files, there should be at least one WAL file with an index that is current or newer
                // than the snapshot index. If the WAL index is behind the snapshot index somehow, the snapshot is assumed
                // to be outdated.
                loadSnapshot := true
                if h.wal != nil {
                        _, endAt, err := wlog.Segments(h.wal.Dir())
                        if err != nil {
                                return fmt.Errorf("finding WAL segments: %w", err)
                        }

                        // A failure to look up the last snapshot is only logged; the
                        // snapshot load below is still attempted.
                        _, idx, _, err := LastChunkSnapshot(h.opts.ChunkDirRoot)
                        if err != nil && !errors.Is(err, record.ErrNotFound) {
                                level.Error(h.logger).Log("msg", "Could not find last snapshot", "err", err)
                        }

                        if err == nil && endAt < idx {
                                loadSnapshot = false
                                level.Warn(h.logger).Log("msg", "Last WAL file is behind snapshot, removing snapshots")
                                if err := DeleteChunkSnapshots(h.opts.ChunkDirRoot, math.MaxInt, math.MaxInt); err != nil {
                                        level.Error(h.logger).Log("msg", "Error while deleting snapshot directories", "err", err)
                                }
                        }
                }
                if loadSnapshot {
                        var err error
                        snapIdx, snapOffset, refSeries, err = h.loadChunkSnapshot()
                        if err == nil {
                                snapshotLoaded = true
                                chunkSnapshotLoadDuration = time.Since(start)
                                level.Info(h.logger).Log("msg", "Chunk snapshot loading time", "duration", chunkSnapshotLoadDuration.String())
                        }
                        if err != nil {
                                // Forget whatever loadChunkSnapshot returned alongside the error.
                                snapIdx, snapOffset = -1, 0
                                refSeries = make(map[chunks.HeadSeriesRef]*memSeries)

                                h.metrics.snapshotReplayErrorTotal.Inc()
                                level.Error(h.logger).Log("msg", "Failed to load chunk snapshot", "err", err)
                                // We clear the partially loaded data to replay fresh from the WAL.
                                if err := h.resetInMemoryState(); err != nil {
                                        return err
                                }
                        }
                }
        }

        mmapChunkReplayStart := time.Now()
        var (
                mmappedChunks    map[chunks.HeadSeriesRef][]*mmappedChunk
                oooMmappedChunks map[chunks.HeadSeriesRef][]*mmappedChunk
                lastMmapRef      chunks.ChunkDiskMapperRef
                err              error

                mmapChunkReplayDuration time.Duration
        )
        if snapshotLoaded || h.wal != nil {
                // If snapshot was not loaded and if there is no WAL, then m-map chunks will be discarded
                // anyway. So we only load m-map chunks when it won't be discarded.
                mmappedChunks, oooMmappedChunks, lastMmapRef, err = h.loadMmappedChunks(refSeries)
                if err != nil {
                        // TODO(codesome): clear out all m-map chunks here for refSeries.
                        level.Error(h.logger).Log("msg", "Loading on-disk chunks failed", "err", err)
                        var cerr *chunks.CorruptionErr
                        if errors.As(err, &cerr) {
                                h.metrics.mmapChunkCorruptionTotal.Inc()
                        }

                        // Discard snapshot data since we need to replay the WAL for the missed m-map chunks data.
                        snapIdx, snapOffset = -1, 0

                        // If this fails, data will be recovered from WAL.
                        // Hence we wont lose any data (given WAL is not corrupt).
                        mmappedChunks, oooMmappedChunks, lastMmapRef, err = h.removeCorruptedMmappedChunks(err)
                        if err != nil {
                                return err
                        }
                }
                mmapChunkReplayDuration = time.Since(mmapChunkReplayStart)
                level.Info(h.logger).Log("msg", "On-disk memory mappable chunks replay completed", "duration", mmapChunkReplayDuration.String())
        }

        if h.wal == nil {
                level.Info(h.logger).Log("msg", "WAL not found")
                return nil
        }

        level.Info(h.logger).Log("msg", "Replaying WAL, this may take a while")

        checkpointReplayStart := time.Now()
        // Backfill the checkpoint first if it exists.
        // A record.ErrNotFound result is tolerated here and checked again below
        // through err to decide whether a checkpoint is replayed at all.
        dir, startFrom, err := wlog.LastCheckpoint(h.wal.Dir())
        if err != nil && !errors.Is(err, record.ErrNotFound) {
                return fmt.Errorf("find last checkpoint: %w", err)
        }

        // Find the last segment.
        // The error goes into e so that err keeps the LastCheckpoint result.
        _, endAt, e := wlog.Segments(h.wal.Dir())
        if e != nil {
                return fmt.Errorf("finding WAL segments: %w", e)
        }

        h.startWALReplayStatus(startFrom, endAt)

        syms := labels.NewSymbolTable() // One table for the whole WAL.
        multiRef := map[chunks.HeadSeriesRef]chunks.HeadSeriesRef{}
        // Replay the checkpoint only if one was found (err == nil) and its index
        // is not behind the snapshot index.
        if err == nil && startFrom >= snapIdx {
                sr, err := wlog.NewSegmentsReader(dir)
                if err != nil {
                        return fmt.Errorf("open checkpoint: %w", err)
                }
                defer func() {
                        if err := sr.Close(); err != nil {
                                level.Warn(h.logger).Log("msg", "Error while closing the wal segments reader", "err", err)
                        }
                }()

                // A corrupted checkpoint is a hard error for now and requires user
                // intervention. There's likely little data that can be recovered anyway.
                if err := h.loadWAL(wlog.NewReader(sr), syms, multiRef, mmappedChunks, oooMmappedChunks); err != nil {
                        return fmt.Errorf("backfill checkpoint: %w", err)
                }
                h.updateWALReplayStatusRead(startFrom)
                // Segment replay continues with the index after the checkpoint.
                startFrom++
                level.Info(h.logger).Log("msg", "WAL checkpoint loaded")
        }
        checkpointReplayDuration := time.Since(checkpointReplayStart)

        walReplayStart := time.Now()

        // Segments before the snapshot index are skipped.
        if snapIdx > startFrom {
                startFrom = snapIdx
        }
        // Backfill segments from the most recent checkpoint onwards.
        for i := startFrom; i <= endAt; i++ {
                s, err := wlog.OpenReadSegment(wlog.SegmentName(h.wal.Dir(), i))
                if err != nil {
                        return fmt.Errorf("open WAL segment: %d: %w", i, err)
                }

                // Only the segment the snapshot points at is read from a non-zero offset.
                offset := 0
                if i == snapIdx {
                        offset = snapOffset
                }
                sr, err := wlog.NewSegmentBufReaderWithOffset(offset, s)
                if errors.Is(err, io.EOF) {
                        // File does not exist.
                        continue
                }
                if err != nil {
                        return fmt.Errorf("segment reader (offset=%d): %w", offset, err)
                }
                // The reader is closed before the loadWAL error is inspected so
                // that it is closed on both the success and the failure path.
                err = h.loadWAL(wlog.NewReader(sr), syms, multiRef, mmappedChunks, oooMmappedChunks)
                if err := sr.Close(); err != nil {
                        level.Warn(h.logger).Log("msg", "Error while closing the wal segments reader", "err", err)
                }
                if err != nil {
                        return err
                }
                level.Info(h.logger).Log("msg", "WAL segment loaded", "segment", i, "maxSegment", endAt)
                h.updateWALReplayStatusRead(i)
        }
        walReplayDuration := time.Since(walReplayStart)

        wblReplayStart := time.Now()
        if h.wbl != nil {
                // Replay WBL.
                // startFrom and endAt are reused for the WBL segment range.
                startFrom, endAt, e = wlog.Segments(h.wbl.Dir())
                if e != nil {
                        return &errLoadWbl{fmt.Errorf("finding WBL segments: %w", e)}
                }
                h.startWALReplayStatus(startFrom, endAt)

                for i := startFrom; i <= endAt; i++ {
                        s, err := wlog.OpenReadSegment(wlog.SegmentName(h.wbl.Dir(), i))
                        if err != nil {
                                return &errLoadWbl{fmt.Errorf("open WBL segment: %d: %w", i, err)}
                        }

                        sr := wlog.NewSegmentBufReader(s)
                        err = h.loadWBL(wlog.NewReader(sr), syms, multiRef, lastMmapRef)
                        if err := sr.Close(); err != nil {
                                level.Warn(h.logger).Log("msg", "Error while closing the wbl segments reader", "err", err)
                        }
                        if err != nil {
                                return &errLoadWbl{err}
                        }
                        level.Info(h.logger).Log("msg", "WBL segment loaded", "segment", i, "maxSegment", endAt)
                        h.updateWALReplayStatusRead(i)
                }
        }

        wblReplayDuration := time.Since(wblReplayStart)

        // The total covers everything since the start of Init, including the
        // snapshot and m-mapped chunk loading.
        totalReplayDuration := time.Since(start)
        h.metrics.dataTotalReplayDuration.Set(totalReplayDuration.Seconds())
        level.Info(h.logger).Log(
                "msg", "WAL replay completed",
                "checkpoint_replay_duration", checkpointReplayDuration.String(),
                "wal_replay_duration", walReplayDuration.String(),
                "wbl_replay_duration", wblReplayDuration.String(),
                "chunk_snapshot_load_duration", chunkSnapshotLoadDuration.String(),
                "mmap_chunk_replay_duration", mmapChunkReplayDuration.String(),
                "total_replay_duration", totalReplayDuration.String(),
        )

        return nil
}

// loadMmappedChunks walks every chunk known to the chunk disk mapper and sorts
// it into one of four places:
//   - in-order chunk of a series present in refSeries: appended to that series;
//   - out-of-order chunk of a series present in refSeries: appended to the
//     series' OOO chunk list;
//   - in-order / out-of-order chunk of a series not in refSeries: collected in
//     the first / second returned map, keyed by series ref.
//
// In-order chunks ending before the head's min valid time and chunks with an
// invalid encoding are ignored. An in-order chunk that does not start after the
// previous chunk of the same series aborts the iteration with an error.
//
// The third return value is the ref of the last chunk visited; on error it is
// the ref of the chunk visited just before the failing one.
func (h *Head) loadMmappedChunks(refSeries map[chunks.HeadSeriesRef]*memSeries) (map[chunks.HeadSeriesRef][]*mmappedChunk, map[chunks.HeadSeriesRef][]*mmappedChunk, chunks.ChunkDiskMapperRef, error) {
        var (
                inOrder    = map[chunks.HeadSeriesRef][]*mmappedChunk{}
                outOfOrder = map[chunks.HeadSeriesRef][]*mmappedChunk{}
                // currRef is the chunk most recently handed to visit, prevRef the one before it.
                currRef, prevRef chunks.ChunkDiskMapperRef
        )

        visit := func(seriesRef chunks.HeadSeriesRef, chunkRef chunks.ChunkDiskMapperRef, mint, maxt int64, numSamples uint16, encoding chunkenc.Encoding, isOOO bool) error {
                prevRef, currRef = currRef, chunkRef

                if !isOOO && maxt < h.minValidTime.Load() {
                        return nil
                }
                // Chunks without a valid encoding are skipped.
                if !chunkenc.IsValidEncoding(encoding) {
                        return nil
                }

                series, known := refSeries[seriesRef]
                chk := &mmappedChunk{
                        ref:        chunkRef,
                        minTime:    mint,
                        maxTime:    maxt,
                        numSamples: numSamples,
                }

                switch {
                case isOOO && !known:
                        outOfOrder[seriesRef] = append(outOfOrder[seriesRef], chk)
                        return nil

                case isOOO:
                        h.metrics.chunks.Inc()
                        h.metrics.chunksCreated.Inc()

                        if series.ooo == nil {
                                series.ooo = &memSeriesOOOFields{}
                        }
                        series.ooo.oooMmappedChunks = append(series.ooo.oooMmappedChunks, chk)

                        h.updateMinOOOMaxOOOTime(mint, maxt)
                        return nil

                case !known:
                        pending := inOrder[seriesRef]
                        if n := len(pending); n > 0 && pending[n-1].maxTime >= mint {
                                h.metrics.mmapChunkCorruptionTotal.Inc()
                                return fmt.Errorf("out of sequence m-mapped chunk for series ref %d, last chunk: [%d, %d], new: [%d, %d]",
                                        seriesRef, pending[n-1].minTime, pending[n-1].maxTime, mint, maxt)
                        }
                        inOrder[seriesRef] = append(pending, chk)
                        return nil
                }

                // In-order chunk of a series that is present in refSeries.
                if n := len(series.mmappedChunks); n > 0 && series.mmappedChunks[n-1].maxTime >= mint {
                        h.metrics.mmapChunkCorruptionTotal.Inc()
                        return fmt.Errorf("out of sequence m-mapped chunk for series ref %d, last chunk: [%d, %d], new: [%d, %d]",
                                seriesRef, series.mmappedChunks[n-1].minTime, series.mmappedChunks[n-1].maxTime,
                                mint, maxt)
                }

                h.metrics.chunks.Inc()
                h.metrics.chunksCreated.Inc()
                series.mmappedChunks = append(series.mmappedChunks, chk)
                h.updateMinMaxTime(mint, maxt)

                // The head chunk was completed and was m-mapped after taking the snapshot.
                // Hence remove this chunk.
                if series.headChunks != nil && maxt >= series.headChunks.minTime {
                        series.nextAt = 0
                        series.headChunks = nil
                        series.app = nil
                }
                return nil
        }

        if err := h.chunkDiskMapper.IterateAllChunks(visit); err != nil {
                // prevRef because the chunk at currRef caused the error.
                return nil, nil, prevRef, fmt.Errorf("iterate on on-disk chunks: %w", err)
        }
        return inOrder, outOfOrder, currRef, nil
}

// removeCorruptedMmappedChunks attempts to delete the corrupted mmapped chunks and if it fails, it clears all the previously
// loaded mmapped chunks.
//
// err is the error returned by the failed m-mapped chunk load; it is handed to
// the chunk disk mapper to identify what has to be deleted.
//
// The in-memory state of the head is reset first. A non-nil error is returned
// only when that reset fails. In every other case the function recovers on a
// best-effort basis and returns a nil error together with:
//   - the chunks reloaded after a successful deletion, or
//   - empty maps and a zero last chunk ref whenever all chunk files had to be
//     discarded (deletion failed, or reloading after the deletion failed).
func (h *Head) removeCorruptedMmappedChunks(err error) (map[chunks.HeadSeriesRef][]*mmappedChunk, map[chunks.HeadSeriesRef][]*mmappedChunk, chunks.ChunkDiskMapperRef, error) {
        // We never want to preserve the in-memory series from snapshots if we are repairing m-map chunks.
        if err := h.resetInMemoryState(); err != nil {
                return map[chunks.HeadSeriesRef][]*mmappedChunk{}, map[chunks.HeadSeriesRef][]*mmappedChunk{}, 0, err
        }

        // Logged exactly once, right before the deletion is attempted.
        level.Info(h.logger).Log("msg", "Deleting mmapped chunk files")

        if err := h.chunkDiskMapper.DeleteCorrupted(err); err != nil {
                level.Info(h.logger).Log("msg", "Deletion of corrupted mmap chunk files failed, discarding chunk files completely", "err", err)
                if err := h.chunkDiskMapper.Truncate(math.MaxUint32); err != nil {
                        level.Error(h.logger).Log("msg", "Deletion of all mmap chunk files failed", "err", err)
                }
                return map[chunks.HeadSeriesRef][]*mmappedChunk{}, map[chunks.HeadSeriesRef][]*mmappedChunk{}, 0, nil
        }

        level.Info(h.logger).Log("msg", "Deletion of mmap chunk files successful, reattempting m-mapping the on-disk chunks")
        mmappedChunks, oooMmappedChunks, lastRef, err := h.loadMmappedChunks(make(map[chunks.HeadSeriesRef]*memSeries))
        if err != nil {
                level.Error(h.logger).Log("msg", "Loading on-disk chunks failed, discarding chunk files completely", "err", err)
                if err := h.chunkDiskMapper.Truncate(math.MaxUint32); err != nil {
                        level.Error(h.logger).Log("msg", "Deletion of all mmap chunk files failed after failed loading", "err", err)
                }
                // Everything on disk was discarded, so report the same empty state
                // as the discard path above instead of the partial results (nil OOO
                // map, ref of a chunk that no longer exists) of the failed reload.
                mmappedChunks = map[chunks.HeadSeriesRef][]*mmappedChunk{}
                oooMmappedChunks = map[chunks.HeadSeriesRef][]*mmappedChunk{}
                lastRef = 0
        }

        return mmappedChunks, oooMmappedChunks, lastRef, nil
}

// ApplyConfig applies the storage related parts of cfg to the head.
//
// The out-of-order time window is taken from the TSDB config (0 when that
// config is absent, negative values are clamped to 0) and forwarded, together
// with wbl, to SetOutOfOrderTimeWindow. When exemplar storage is enabled, the
// maximum number of exemplars is updated and the exemplar storage is resized
// if the value changed.
func (h *Head) ApplyConfig(cfg *config.Config, wbl *wlog.WL) {
        var window int64
        if tsdbCfg := cfg.StorageConfig.TSDBConfig; tsdbCfg != nil {
                window = tsdbCfg.OutOfOrderTimeWindow
        }
        if window < 0 {
                window = 0
        }
        h.SetOutOfOrderTimeWindow(window, wbl)

        if !h.opts.EnableExemplarStorage {
                return
        }

        // Head uses opts.MaxExemplars in combination with opts.EnableExemplarStorage
        // to decide if it should pass exemplars along to its exemplar storage, so we
        // need to update opts.MaxExemplars here.
        oldMax := h.opts.MaxExemplars.Load()
        h.opts.MaxExemplars.Store(cfg.StorageConfig.ExemplarsConfig.MaxExemplars)
        newMax := h.opts.MaxExemplars.Load()

        if oldMax != newMax {
                migrated := h.exemplars.(*CircularExemplarStorage).Resize(newMax)
                level.Info(h.logger).Log("msg", "Exemplar storage resized", "from", oldMax, "to", newMax, "migrated", migrated)
        }
}

// SetOutOfOrderTimeWindow stores the new out-of-order time window in the head
// options. The given wbl is adopted only when the window is positive and the
// head has no WBL yet; otherwise it is ignored.
func (h *Head) SetOutOfOrderTimeWindow(oooTimeWindow int64, wbl *wlog.WL) {
        adoptWBL := h.wbl == nil && oooTimeWindow > 0
        if adoptWBL {
                h.wbl = wbl
        }

        h.opts.OutOfOrderTimeWindow.Store(oooTimeWindow)
}

// EnableNativeHistograms enables the native histogram feature.
// It only stores true into the EnableNativeHistograms head option.
func (h *Head) EnableNativeHistograms() {
        h.opts.EnableNativeHistograms.Store(true)
}

// DisableNativeHistograms disables the native histogram feature.
// It only stores false into the EnableNativeHistograms head option.
func (h *Head) DisableNativeHistograms() {
        h.opts.EnableNativeHistograms.Store(false)
}

// PostingsCardinalityStats returns highest cardinality stats by label and value names.
//
// Results are cached under cardinalityMutex. The cached value is reused as
// long as the (statsByLabelName, limit) pair matches the previous call and the
// value is not older than cardinalityCacheExpirationTime; otherwise the stats
// are recomputed from the postings and the cache is refreshed.
func (h *Head) PostingsCardinalityStats(statsByLabelName string, limit int) *index.PostingsStats {
        key := statsByLabelName + ";" + strconv.Itoa(limit)

        h.cardinalityMutex.Lock()
        defer h.cardinalityMutex.Unlock()

        // Timestamps are tracked as whole seconds since the Unix epoch.
        nowSeconds := func() time.Duration {
                return time.Duration(time.Now().Unix()) * time.Second
        }

        // The age is only evaluated when the key matches.
        stale := h.cardinalityCacheKey != key ||
                nowSeconds()-h.lastPostingsStatsCall > cardinalityCacheExpirationTime
        if stale {
                h.cardinalityCache = nil
        }
        if cached := h.cardinalityCache; cached != nil {
                return cached
        }

        h.cardinalityCacheKey = key
        h.cardinalityCache = h.postings.Stats(statsByLabelName, limit)
        h.lastPostingsStatsCall = nowSeconds()

        return h.cardinalityCache
}

// updateMinMaxTime widens the head's time range so that it covers
// [mint, maxt]: the min time is lowered to mint if mint is smaller, and the
// max time is raised to maxt if maxt is larger. Each bound is updated with a
// compare-and-swap that is retried until it succeeds or the bound already
// covers the given value.
func (h *Head) updateMinMaxTime(mint, maxt int64) {
        for cur := h.MinTime(); mint < cur; cur = h.MinTime() {
                if h.minTime.CompareAndSwap(cur, mint) {
                        break
                }
        }
        for cur := h.MaxTime(); maxt > cur; cur = h.MaxTime() {
                if h.maxTime.CompareAndSwap(cur, maxt) {
                        break
                }
        }
}

// updateMinOOOMaxOOOTime widens the head's out-of-order time range so that it
// covers [mint, maxt]: the min OOO time is lowered to mint if mint is smaller,
// and the max OOO time is raised to maxt if maxt is larger. Each bound is
// updated with a compare-and-swap that is retried until it succeeds or the
// bound already covers the given value.
func (h *Head) updateMinOOOMaxOOOTime(mint, maxt int64) {
        for cur := h.MinOOOTime(); mint < cur; cur = h.MinOOOTime() {
                if h.minOOOTime.CompareAndSwap(cur, mint) {
                        break
                }
        }
        for cur := h.MaxOOOTime(); maxt > cur; cur = h.MaxOOOTime() {
                if h.maxOOOTime.CompareAndSwap(cur, maxt) {
                        break
                }
        }
}

// SetMinValidTime sets the minimum timestamp the head can ingest.
// It only stores the new value; no existing data is removed by this call.
func (h *Head) SetMinValidTime(minValidTime int64) {
        h.minValidTime.Store(minValidTime)
}

// Truncate removes old data before mint from the head and WAL.
//
// The in-memory data is truncated first. The WAL is truncated afterwards, but
// only if the head was already initialized when Truncate was called.
func (h *Head) Truncate(mint int64) (err error) {
        // Sample the state up front: truncateMemory stores a new head min time.
        wasInitialized := h.initialized()

        if memErr := h.truncateMemory(mint); memErr != nil {
                return memErr
        }
        if wasInitialized {
                return h.truncateWAL(mint)
        }
        return nil
}

// OverlapsClosedInterval reports whether the head's [MinTime, MaxTime] range
// shares at least one timestamp with the closed interval [mint, maxt].
func (h *Head) OverlapsClosedInterval(mint, maxt int64) bool {
        if maxt < h.MinTime() {
                // The interval ends before the head starts.
                return false
        }
        return h.MaxTime() >= mint
}

// truncateMemory removes old data before mint from the head.
//
// It holds chunkSnapshotMtx for its whole duration and increments the
// headTruncateFail metric whenever it returns a non-nil error. For a head that
// is not yet initialized it only moves the time bounds to mint and returns
// without truncating series or chunks.
func (h *Head) truncateMemory(mint int64) (err error) {
        h.chunkSnapshotMtx.Lock()
        defer h.chunkSnapshotMtx.Unlock()

        // Observes the named result, so every error return path is counted.
        defer func() {
                if err != nil {
                        h.metrics.headTruncateFail.Inc()
                }
        }()

        initialized := h.initialized()

        // Nothing to do for an initialized head that already starts at or after mint.
        if h.MinTime() >= mint && initialized {
                return nil
        }

        // The order of these two Store() should not be changed,
        // i.e. truncation time is set before in-process boolean.
        h.lastMemoryTruncationTime.Store(mint)
        h.memTruncationInProcess.Store(true)
        defer h.memTruncationInProcess.Store(false)

        // We wait for pending queries to end that overlap with this truncation.
        if initialized {
                h.WaitForPendingReadersInTimeRange(h.MinTime(), mint)
        }

        h.minTime.Store(mint)
        h.minValidTime.Store(mint)

        // Ensure that max time is at least as high as min time.
        // The compare-and-swap is retried until MaxTime is no longer below mint.
        for h.MaxTime() < mint {
                h.maxTime.CompareAndSwap(h.MaxTime(), mint)
        }

        // This was an initial call to Truncate after loading blocks on startup.
        // We haven't read back the WAL yet, so do not attempt to truncate it.
        if !initialized {
                return nil
        }

        h.metrics.headTruncateTotal.Inc()
        return h.truncateSeriesAndChunkDiskMapper("truncateMemory")
}

// WaitForPendingReadersInTimeRange blocks until no open read overlaps the
// given time range, polling every 500ms.
// The query timeout limits the max wait time of this function implicitly.
// The mint is inclusive and maxt is the truncation time hence exclusive.
func (h *Head) WaitForPendingReadersInTimeRange(mint, maxt int64) {
        // Turn the exclusive upper bound into an inclusive one.
        lastIncluded := maxt - 1

        hasOverlappingRead := func() (found bool) {
                h.iso.TraverseOpenReads(func(s *isolationState) bool {
                        if s.maxt < mint || lastIncluded < s.mint {
                                // Disjoint from the truncation range, keep looking.
                                return true
                        }
                        found = true
                        return false
                })
                return found
        }

        for hasOverlappingRead() {
                time.Sleep(500 * time.Millisecond)
        }
}

// WaitForPendingReadersForOOOChunksAtOrBefore is the OOO counterpart of
// WaitForPendingReadersInTimeRange: it blocks, polling every 500ms, until the
// OOO isolation no longer reports open reads at or before the given chunk.
func (h *Head) WaitForPendingReadersForOOOChunksAtOrBefore(chunk chunks.ChunkDiskMapperRef) {
        for {
                if !h.oooIso.HasOpenReadsAtOrBefore(chunk) {
                        return
                }
                time.Sleep(500 * time.Millisecond)
        }
}

// WaitForAppendersOverlapping blocks, polling every 500ms, until the lowest
// append time reported by the isolation is strictly greater than maxt.
func (h *Head) WaitForAppendersOverlapping(maxt int64) {
        for {
                if maxt < h.iso.lowestAppendTime() {
                        return
                }
                time.Sleep(500 * time.Millisecond)
        }
}

// IsQuerierCollidingWithTruncation reports whether the current querier has to be
// closed (shouldClose) and whether a replacement querier has to be created (getNew).
// When getNew is true, newMint is the min time to use for the new range head and
// the new querier. This method helps to prevent races with the truncation of
// in-memory data.
//
// NOTE: The querier should already be taken before calling this.
func (h *Head) IsQuerierCollidingWithTruncation(querierMint, querierMaxt int64) (shouldClose, getNew bool, newMint int64) {
        if !h.memTruncationInProcess.Load() {
                // No truncation running, nothing to collide with.
                return false, false, 0
        }

        // A head truncation is in process, which also means the block created for
        // the truncated range is already available. Compare the querier's range
        // against the truncation time.
        truncT := h.lastMemoryTruncationTime.Load()
        switch {
        case querierMaxt < truncT:
                // The whole query range is being truncated away from the Head, so the
                // query no longer overlaps with it. Close the querier to avoid races;
                // the data is served by the blocks.
                // Cases:
                // 1.     |------truncation------|
                //   |---query---|
                // 2.     |------truncation------|
                //              |---query---|
                return true, false, 0
        case querierMint < truncT:
                // The query straddles the truncation time: the truncation began after
                // the querier was taken. Using it is unsafe and/or may block the
                // truncation, so a new querier starting at the truncation time is
                // needed; the earlier part is served by the blocks.
                // Case:
                //      |------truncation------|
                //                        |----query----|
                // Turns into
                //      |------truncation------|
                //                             |---qu---|
                return true, true, truncT
        default:
                // The query starts at or after the truncation time; nothing to do.
                //      |------truncation------|
                //                              |---query---|
                return false, false, 0
        }
}

// truncateWAL removes old data before mint from the WAL by writing a checkpoint
// over the older segments and then truncating them.
// It is a no-op when the head has no WAL or when mint is not greater than the
// last WAL truncation time. It holds chunkSnapshotMtx for its whole duration.
// Errors from listing segments, cutting a new segment or creating the checkpoint
// are returned; failures to truncate segments or delete old checkpoints are
// only logged.
func (h *Head) truncateWAL(mint int64) error {
        h.chunkSnapshotMtx.Lock()
        defer h.chunkSnapshotMtx.Unlock()

        if h.wal == nil {
                return nil
        }
        if mint <= h.lastWALTruncationTime.Load() {
                return nil
        }
        began := time.Now()
        h.lastWALTruncationTime.Store(mint)

        firstSeg, lastSeg, err := wlog.Segments(h.wal.Dir())
        if err != nil {
                return fmt.Errorf("get segment range: %w", err)
        }
        // Cut a fresh segment so that a TSDB with low ingestion volume does not
        // hold on to more WAL than needed.
        if _, err := h.wal.NextSegment(); err != nil {
                return fmt.Errorf("next segment: %w", err)
        }
        // The last segment is never part of a checkpoint.
        lastSeg--
        if lastSeg < 0 {
                // No segments yet.
                return nil
        }
        // The lower two thirds of the segments should contain mostly obsolete
        // samples. With fewer than two segments a checkpoint is not worth it yet.
        // With the default 2h blocks this keeps up to around 3h worth of WAL
        // segments.
        cpEnd := firstSeg + (lastSeg-firstSeg)*2/3
        if cpEnd <= firstSeg {
                return nil
        }

        // keepSeries decides which series records survive into the checkpoint:
        // series still in the head, and deleted series that are tracked with a
        // segment number beyond the checkpointed range.
        keepSeries := func(ref chunks.HeadSeriesRef) bool {
                if h.series.getByID(ref) != nil {
                        return true
                }
                h.deletedMtx.Lock()
                until, tracked := h.deleted[ref]
                h.deletedMtx.Unlock()
                return tracked && until > cpEnd
        }

        h.metrics.checkpointCreationTotal.Inc()
        if _, err := wlog.Checkpoint(h.logger, h.wal, firstSeg, cpEnd, keepSeries, mint); err != nil {
                h.metrics.checkpointCreationFail.Inc()
                var corruption *chunks.CorruptionErr
                if errors.As(err, &corruption) {
                        h.metrics.walCorruptionsTotal.Inc()
                }
                return fmt.Errorf("create checkpoint: %w", err)
        }

        truncErr := h.wal.Truncate(cpEnd + 1)
        if truncErr != nil {
                // A failed truncation is retried at the next checkpoint. Leftover
                // segments are ignored later on once a checkpoint supersedes them.
                level.Error(h.logger).Log("msg", "truncating segments failed", "err", truncErr)
        }

        // The checkpoint is written and the segments before it are truncated, so
        // deleted series recorded at or before it no longer need tracking.
        h.deletedMtx.Lock()
        for ref, seg := range h.deleted {
                if seg > cpEnd {
                        continue
                }
                delete(h.deleted, ref)
        }
        h.deletedMtx.Unlock()

        h.metrics.checkpointDeleteTotal.Inc()
        delErr := wlog.DeleteCheckpoints(h.wal.Dir(), cpEnd)
        if delErr != nil {
                // Old checkpoints left behind only cost disk space; they are ignored
                // because a higher checkpoint exists.
                level.Error(h.logger).Log("msg", "delete old checkpoints", "err", delErr)
                h.metrics.checkpointDeleteFail.Inc()
        }
        h.metrics.walTruncateDuration.Observe(time.Since(began).Seconds())

        level.Info(h.logger).Log("msg", "WAL checkpoint complete",
                "first", firstSeg, "last", cpEnd, "duration", time.Since(began))

        return nil
}

// truncateOOO
//   - waits for any pending reads that potentially touch chunks less than or equal to newMinOOOMmapRef,
//   - garbage collects all the m-map chunks from memory that are less than or equal to newMinOOOMmapRef
//     and then deletes the series that do not have any data anymore,
//   - truncates the OOO WBL files whose index is strictly less than lastWBLFile.
//
// The first two steps only happen when newMinOOOMmapRef is greater than the
// currently stored minimum; the WBL truncation only happens when a WBL exists.
//
// The caller is responsible for ensuring that no further queriers will be created that reference chunks less
// than or equal to newMinOOOMmapRef before calling truncateOOO.
func (h *Head) truncateOOO(lastWBLFile int, newMinOOOMmapRef chunks.ChunkDiskMapperRef) error {
        if prev := chunks.ChunkDiskMapperRef(h.minOOOMmapRef.Load()); newMinOOOMmapRef.GreaterThan(prev) {
                h.WaitForPendingReadersForOOOChunksAtOrBefore(newMinOOOMmapRef)
                h.minOOOMmapRef.Store(uint64(newMinOOOMmapRef))

                err := h.truncateSeriesAndChunkDiskMapper("truncateOOO")
                if err != nil {
                        return err
                }
        }

        if h.wbl != nil {
                return h.wbl.Truncate(lastWBLFile)
        }
        return nil
}

// truncateSeriesAndChunkDiskMapper is shared by truncateMemory and truncateOOO.
// It runs a GC on the Head, adjusts the head's min times from the GC result and
// truncates the ChunkDiskMapper to the lowest m-map file still in use.
// caller is only used to label the log line.
func (h *Head) truncateSeriesAndChunkDiskMapper(caller string) error {
        gcStart := time.Now()
        maxT := h.MaxTime()
        inOrderMint, oooMint, mmapFile := h.gc()
        level.Info(h.logger).Log("msg", "Head GC completed", "caller", caller, "duration", time.Since(gcStart))
        h.metrics.gcDuration.Observe(time.Since(gcStart).Seconds())

        if inOrderMint > h.minTime.Load() {
                // The actual mint of the head is higher than the one asked to truncate.
                // Use it, but never go beyond the appendable min valid time, since
                // the actual min time may lie within the appendable window.
                newMint := h.appendableMinValidTime()
                if inOrderMint < newMint {
                        newMint = inOrderMint
                }
                h.minTime.Store(newMint)
                h.minValidTime.Store(newMint)
        }

        if maxT-h.opts.OutOfOrderTimeWindow.Load() < oooMint {
                // The start of the allowed OOO window is below the min OOO time seen
                // during GC, so an OOO sample older than that may have been inserted
                // meanwhile. Play safe and use the lowest possible time.
                oooMint = maxT - h.opts.OutOfOrderTimeWindow.Load()
        }
        h.minOOOTime.Store(oooMint)

        // Truncate the chunk m-mapper.
        err := h.chunkDiskMapper.Truncate(uint32(mmapFile))
        if err != nil {
                return fmt.Errorf("truncate chunks.HeadReadWriter by file number: %w", err)
        }
        return nil
}

// Stats is the set of Head statistics returned by (*Head).Stats.
type Stats struct {
        // NumSeries is filled from Head.NumSeries.
        NumSeries         uint64
        // MinTime and MaxTime are filled from Head.MinTime and Head.MaxTime.
        MinTime, MaxTime  int64
        // IndexPostingStats is filled from Head.PostingsCardinalityStats.
        IndexPostingStats *index.PostingsStats
}

// Stats collects the current series count, time bounds and postings cardinality
// statistics of the HEAD. statsByLabelName and limit are passed through to
// PostingsCardinalityStats. Note that computing these is expensive.
func (h *Head) Stats(statsByLabelName string, limit int) *Stats {
        st := new(Stats)
        st.NumSeries = h.NumSeries()
        st.MaxTime = h.MaxTime()
        st.MinTime = h.MinTime()
        st.IndexPostingStats = h.PostingsCardinalityStats(statsByLabelName, limit)
        return st
}

// RangeHead allows querying Head via an IndexReader, ChunkReader and tombstones.Reader
// but only within a restricted range.  Used for queries and compactions.
type RangeHead struct {
        // head is the underlying Head that all reads are delegated to.
        head       *Head
        // mint and maxt are the bounds handed to the head's indexRange and chunksRange.
        mint, maxt int64

        // isolationOff, when true, makes Chunks skip creating an isolationState.
        isolationOff bool
}

// NewRangeHead wraps head in a *RangeHead limited to [mint, maxt].
// mint and maxt are stored as given; no restrictions are applied to them.
func NewRangeHead(head *Head, mint, maxt int64) *RangeHead {
        rh := new(RangeHead)
        rh.head, rh.mint, rh.maxt = head, mint, maxt
        return rh
}

// NewRangeHeadWithIsolationDisabled is like NewRangeHead, but the returned
// *RangeHead does not create an isolationState when its chunk reader is requested.
func NewRangeHeadWithIsolationDisabled(head *Head, mint, maxt int64) *RangeHead {
        return &RangeHead{
                head:         head,
                mint:         mint,
                maxt:         maxt,
                isolationOff: true,
        }
}

// Index returns the head's index reader restricted to this range head's
// [mint, maxt]. The returned error is always nil.
func (h *RangeHead) Index() (IndexReader, error) {
        reader := h.head.indexRange(h.mint, h.maxt)
        return reader, nil
}

// Chunks returns the head's chunk reader restricted to this range head's
// [mint, maxt]. Unless isolation is switched off, a new isolation state for the
// range is requested from the head and handed to the reader; otherwise the
// reader gets a nil isolation state.
func (h *RangeHead) Chunks() (ChunkReader, error) {
        if h.isolationOff {
                return h.head.chunksRange(h.mint, h.maxt, nil)
        }
        state := h.head.iso.State(h.mint, h.maxt)
        return h.head.chunksRange(h.mint, h.maxt, state)
}

// Tombstones returns the tombstones of the underlying head; they are not
// restricted to the range. The returned error is always nil.
func (h *RangeHead) Tombstones() (tombstones.Reader, error) {
        var reader tombstones.Reader = h.head.tombstones
        return reader, nil
}

// MinTime returns the lower bound this range head was created with.
func (h *RangeHead) MinTime() int64 {
        lower := h.mint
        return lower
}

// MaxTime returns the max time of actual data that can be fetched from the
// range head. It bounds the chunks time range, which is the closed interval
// [b.MinTime, b.MaxTime].
func (h *RangeHead) MaxTime() int64 {
        upper := h.maxt
        return upper
}

// BlockMaxTime returns the max time of the block that could be created from
// this head. Block intervals are half-open, [b.MinTime, b.MaxTime), so the
// block's max time is one millisecond past MaxTime, the last sample time it
// can include.
func (h *RangeHead) BlockMaxTime() int64 {
        return h.maxt + 1
}

// NumSeries returns the series count of the whole underlying head; it is not
// restricted to the range.
func (h *RangeHead) NumSeries() uint64 {
        return h.head.numSeries.Load()
}

// rangeHeadULID is the fixed ULID reported by (*RangeHead).Meta.
var rangeHeadULID = ulid.MustParse("0000000000XXXXXXXRANGEHEAD")

// Meta returns block meta information for the range head: its time bounds,
// the fixed rangeHeadULID and the series count of the underlying head.
func (h *RangeHead) Meta() BlockMeta {
        var meta BlockMeta
        meta.MinTime = h.MinTime()
        meta.MaxTime = h.MaxTime()
        meta.ULID = rangeHeadULID
        meta.Stats = BlockStats{NumSeries: h.NumSeries()}
        return meta
}

// String returns a human readable representation of the range head. Keeping
// this method matters: without it the whole struct would be dumped whenever
// the range head is stringified in errors or logs.
func (h *RangeHead) String() string {
        return fmt.Sprintf("range head (mint: %d, maxt: %d)", h.mint, h.maxt)
}

// Delete marks all samples in [mint, maxt] as deleted for every series that
// satisfies the given label matchers, by recording tombstones.
// The range is first clamped to the head's current [MinTime, MaxTime], and per
// series to that series' own time range. The tombstones are written to the WAL
// (if there is one) before they are added to the in-memory tombstones.
// A cancelled ctx aborts the selection with a wrapped context error.
func (h *Head) Delete(ctx context.Context, mint, maxt int64, ms ...*labels.Matcher) error {
        // Do not delete anything beyond the currently valid range.
        mint, maxt = clampInterval(mint, maxt, h.MinTime(), h.MaxTime())

        postings, err := PostingsForMatchers(ctx, h.indexRange(mint, maxt), ms...)
        if err != nil {
                return fmt.Errorf("select series: %w", err)
        }

        var stones []tombstones.Stone
        for postings.Next() {
                if ctxErr := ctx.Err(); ctxErr != nil {
                        return fmt.Errorf("select series: %w", ctxErr)
                }

                ref := postings.At()
                s := h.series.getByID(chunks.HeadSeriesRef(ref))
                if s == nil {
                        level.Debug(h.logger).Log("msg", "Series not found in Head.Delete")
                        continue
                }

                s.Lock()
                lo, hi := s.minTime(), s.maxTime()
                s.Unlock()
                if lo == math.MinInt64 || hi == math.MinInt64 {
                        continue
                }
                // Delete only until the current values and not beyond.
                lo, hi = clampInterval(mint, maxt, lo, hi)
                stones = append(stones, tombstones.Stone{
                        Ref:       ref,
                        Intervals: tombstones.Intervals{{Mint: lo, Maxt: hi}},
                })
        }
        if iterErr := postings.Err(); iterErr != nil {
                return iterErr
        }
        if ctxErr := ctx.Err(); ctxErr != nil {
                return fmt.Errorf("select series: %w", ctxErr)
        }

        if h.wal != nil {
                var enc record.Encoder
                if logErr := h.wal.Log(enc.Tombstones(stones, nil)); logErr != nil {
                        return logErr
                }
        }
        for i := range stones {
                h.tombstones.AddInterval(stones[i].Ref, stones[i].Intervals[0])
        }

        return nil
}

// gc removes data before the minimum timestamp from the head.
// It drops old chunks and fully emptied series from the series stripes, updates
// the related metrics and the series counter, removes the deleted series from
// the postings and the tombstones, and - when a WAL is present - records for
// each deleted series the last WAL segment that existed at GC time, so that its
// series record is kept until the WAL has moved past that segment.
//
// It returns
// * The actual min times of the chunks present in the Head.
// * The min OOO time seen during the GC.
// * Min mmap file number seen in the series (in-order and out-of-order) after gc'ing the series.
func (h *Head) gc() (actualInOrderMint, minOOOTime int64, minMmapFile int) {
        // Only data strictly lower than this timestamp must be deleted.
        mint := h.MinTime()
        // Only ooo m-map chunks lower than or equal to this ref
        // must be deleted.
        minOOOMmapRef := chunks.ChunkDiskMapperRef(h.minOOOMmapRef.Load())

        // Drop old chunks and remember series IDs and hashes if they can be
        // deleted entirely.
        deleted, affected, chunksRemoved, actualInOrderMint, minOOOTime, minMmapFile := h.series.gc(mint, minOOOMmapRef)
        seriesRemoved := len(deleted)

        h.metrics.seriesRemoved.Add(float64(seriesRemoved))
        h.metrics.chunksRemoved.Add(float64(chunksRemoved))
        h.metrics.chunks.Sub(float64(chunksRemoved))
        h.numSeries.Sub(uint64(seriesRemoved))

        // Remove deleted series IDs from the postings lists.
        h.postings.Delete(deleted, affected)

        // Remove tombstones referring to the deleted series.
        h.tombstones.DeleteTombstones(deleted)
        h.tombstones.TruncateBefore(mint)

        if h.wal != nil {
                _, last, err := wlog.Segments(h.wal.Dir())
                if err != nil {
                        // The segment index returned alongside the error is still used
                        // below, exactly as before, so the bookkeeping is unchanged. The
                        // failure is surfaced because a wrong 'last' decides how long the
                        // series records of the deleted series are kept in checkpoints.
                        level.Error(h.logger).Log("msg", "listing WAL segments during head GC failed", "err", err)
                }
                h.deletedMtx.Lock()
                // Keep series records until we're past segment 'last'
                // because the WAL will still have samples records with
                // this ref ID. If we didn't keep these series records then
                // on start up when we replay the WAL, or any other code
                // that reads the WAL, wouldn't be able to use those
                // samples since we would have no labels for that ref ID.
                for ref := range deleted {
                        h.deleted[chunks.HeadSeriesRef(ref)] = last
                }
                h.deletedMtx.Unlock()
        }

        return actualInOrderMint, minOOOTime, minMmapFile
}

// Tombstones returns the head's tombstones as a tombstones.Reader.
// The returned error is always nil.
func (h *Head) Tombstones() (tombstones.Reader, error) {
        var reader tombstones.Reader = h.tombstones
        return reader, nil
}

// NumSeries returns the current value of the head's active series counter.
func (h *Head) NumSeries() uint64 {
        count := h.numSeries.Load()
        return count
}

// headULID is the fixed ULID reported by (*Head).Meta.
var headULID = ulid.MustParse("0000000000XXXXXXXXXXXXHEAD")

// Meta returns block meta information about the head: its current time
// bounds, the fixed headULID and the current series count.
// The head is dynamic, so consecutive calls can return different results.
func (h *Head) Meta() BlockMeta {
        var meta BlockMeta
        meta.MinTime = h.MinTime()
        meta.MaxTime = h.MaxTime()
        meta.ULID = headULID
        meta.Stats = BlockStats{NumSeries: h.NumSeries()}
        return meta
}

// MinTime returns the lowest time bound on visible data in the head.
func (h *Head) MinTime() int64 {
        lowest := h.minTime.Load()
        return lowest
}

// MaxTime returns the highest timestamp seen in data of the head.
func (h *Head) MaxTime() int64 {
        highest := h.maxTime.Load()
        return highest
}

// MinOOOTime returns the lowest time bound on visible data in the
// out-of-order head.
func (h *Head) MinOOOTime() int64 {
        lowest := h.minOOOTime.Load()
        return lowest
}

// MaxOOOTime returns the highest timestamp on visible data in the
// out-of-order head.
func (h *Head) MaxOOOTime() int64 {
        highest := h.maxOOOTime.Load()
        return highest
}

// initialized reports whether the head has a MinTime set, i.e. whether its
// min time differs from the math.MaxInt64 sentinel.
func (h *Head) initialized() bool {
        unset := h.MinTime() == math.MaxInt64
        return !unset
}

// compactable reports whether the head has a compactable range.
// That is the case once the head's time range exceeds 1.5 times the chunk
// range; the extra 0.5 acts as a buffer for the appendable window.
// An uninitialized head is never compactable.
func (h *Head) compactable() bool {
        if h.initialized() {
                span := h.MaxTime() - h.MinTime()
                return span > h.chunkRange.Load()/2*3
        }
        return false
}

// Close marks the head as closed and closes the chunk disk mapper, the WAL and
// the WBL (the latter two only if present). If all of that succeeded and
// memory snapshots on shutdown are enabled, it also takes a snapshot of the
// in-memory chunks. All errors encountered are returned combined.
func (h *Head) Close() error {
        h.closedMtx.Lock()
        defer h.closedMtx.Unlock()
        h.closed = true

        // A snapshot only takes samples from the most recent head chunk, so
        // m-map everything but the last chunk first in case one is performed.
        h.mmapHeadChunks()

        errs := tsdb_errors.NewMulti(h.chunkDiskMapper.Close())
        if h.wal != nil {
                errs.Add(h.wal.Close())
        }
        if h.wbl != nil {
                errs.Add(h.wbl.Close())
        }
        if errs.Err() == nil {
                if h.opts.EnableMemorySnapshotOnShutdown {
                        errs.Add(h.performChunkSnapshot())
                }
        }
        return errs.Err()
}

// String returns a human readable representation of the TSDB head. Keeping
// this method matters: without it the whole struct would be dumped whenever
// the head is stringified in errors or logs.
func (h *Head) String() string {
        const repr = "head"
        return repr
}

// getOrCreate returns the series for the given label set and hash, creating it
// under a freshly allocated series ID if it does not exist yet. The boolean
// result reports whether a new series was created.
func (h *Head) getOrCreate(hash uint64, lset labels.Labels) (*memSeries, bool, error) {
        // Going straight to `getOrCreateWithID` would be semantically sufficient,
        // but it would create a new series on every sample inserted via Add(),
        // which causes allocations and makes the series IDs rather random and
        // harder to compress in postings. So look the series up first.
        if existing := h.series.getByHash(hash, lset); existing != nil {
                return existing, false, nil
        }

        // Optimistically assume that we are the first one to create the series.
        newID := h.lastSeriesID.Inc()
        return h.getOrCreateWithID(chunks.HeadSeriesRef(newID), hash, lset)
}

// getOrCreateWithID returns the series for the given label set and hash. If
// none exists yet, a new memSeries with the given id is stored; in that case
// the creation metric and series counter are incremented and the series is
// added to the postings. The boolean result reports whether a new series was
// created. An error from the series store is returned as is.
func (h *Head) getOrCreateWithID(id chunks.HeadSeriesRef, hash uint64, lset labels.Labels) (*memSeries, bool, error) {
        // build constructs the candidate series; the shard hash is only computed
        // when sharding is enabled.
        build := func() *memSeries {
                var shardHash uint64
                if h.opts.EnableSharding {
                        shardHash = labels.StableHash(lset)
                }
                return newMemSeries(lset, id, shardHash, h.opts.IsolationDisabled)
        }

        s, created, err := h.series.getOrSet(hash, lset, build)
        switch {
        case err != nil:
                return nil, false, err
        case !created:
                return s, false, nil
        }

        h.metrics.seriesCreated.Inc()
        h.numSeries.Inc()
        h.postings.Add(storage.SeriesRef(id), lset)

        return s, true, nil
}

// mmapHeadChunks iterates over all memSeries stored on the Head and calls
// mmapChunks() on each of them, adding the returned counts to the
// mmapChunksTotal metric.
//
// There are two types of chunks that store samples for each memSeries:
// A) Head chunk - stored on the Go heap; newly appended samples go there.
// B) M-mapped chunks - memory mapped chunks, the kernel manages the memory for
// us on-demand; these chunks are read-only.
//
// Calling mmapHeadChunks() m-maps all chunks that should be m-mapped.
// The m-mapping operation needs to be serialised and so it goes via a central lock.
// If multiple memSeries need to m-map some chunk concurrently they can block each other.
//
// To minimise the effect of locking on TSDB operations, m-mapping is serialised and
// done away from the sample append path: waiting on a lock inside an append would
// lock the entire memSeries for (potentially) a long time, which could eventually
// delay the next scrape and/or cause query timeouts.
func (h *Head) mmapHeadChunks() {
        mapped := 0
        for shard := 0; shard < h.series.size; shard++ {
                stripe := &h.series.locks[shard]
                stripe.RLock()
                for _, ms := range h.series.series[shard] {
                        ms.Lock()
                        mapped += ms.mmapChunks(h.chunkDiskMapper)
                        ms.Unlock()
                }
                stripe.RUnlock()
        }
        h.metrics.mmapChunksTotal.Add(float64(mapped))
}

// seriesHashmap lets TSDB find a memSeries by its label set, via a 64-bit hash.
// There is one map for the common case where the hash value is unique, and a
// second map for the case that two series have the same hash value.
// Each series is in only one of the maps.
// Its methods require the hash to be submitted with it to avoid re-computations throughout
// the code.
type seriesHashmap struct {
        // unique holds at most one series per hash value.
        unique    map[uint64]*memSeries
        // conflicts holds further series whose hash is already taken in unique.
        // It may be nil; set() allocates it on first use.
        conflicts map[uint64][]*memSeries
}

// get returns the series stored under hash whose label set equals lset, or nil
// if there is none. The unique map is consulted first, then the conflicts.
func (m *seriesHashmap) get(hash uint64, lset labels.Labels) *memSeries {
        if candidate, ok := m.unique[hash]; ok && labels.Equal(candidate.lset, lset) {
                return candidate
        }
        bucket := m.conflicts[hash]
        for i := range bucket {
                if labels.Equal(bucket[i].lset, lset) {
                        return bucket[i]
                }
        }
        return nil
}

// set stores s under hash. If the hash is free in the unique map, or taken by
// a series with the same label set, s goes there. Otherwise s is placed in the
// conflicts list for that hash, replacing an entry with equal labels if present.
func (m *seriesHashmap) set(hash uint64, s *memSeries) {
        current, taken := m.unique[hash]
        if !taken || labels.Equal(current.lset, s.lset) {
                m.unique[hash] = s
                return
        }

        // Genuine hash collision with a different label set.
        if m.conflicts == nil {
                m.conflicts = map[uint64][]*memSeries{}
        }
        bucket := m.conflicts[hash]
        for i := range bucket {
                if labels.Equal(bucket[i].lset, s.lset) {
                        bucket[i] = s
                        return
                }
        }
        m.conflicts[hash] = append(bucket, s)
}

// del removes the series with the given ref from the entries stored under hash.
// If the removed series was the one in the unique map and conflicting series
// exist, the first of them is promoted into the unique map.
// Nothing happens if the hash is not stored at all.
func (m *seriesHashmap) del(hash uint64, ref chunks.HeadSeriesRef) {
        primary, ok := m.unique[hash]
        if !ok {
                // Supplied hash is not stored.
                return
        }

        bucket := m.conflicts[hash]
        var remaining []*memSeries
        if primary.ref == ref {
                if len(bucket) == 0 {
                        // Exactly one series with this hash was stored.
                        delete(m.unique, hash)
                        return
                }
                // First remaining series goes in 'unique', the rest is kept.
                m.unique[hash] = bucket[0]
                remaining = bucket[1:]
        } else {
                // The series to delete is somewhere in 'conflicts'. Keep all the
                // ones that don't match.
                for _, s := range bucket {
                        if s.ref != ref {
                                remaining = append(remaining, s)
                        }
                }
        }

        if len(remaining) > 0 {
                m.conflicts[hash] = remaining
                return
        }
        delete(m.conflicts, hash)
}

const (
        // DefaultStripeSize is the default number of entries to allocate in the stripeSeries hash map.
        // NOTE(review): stripeSeries derives shard indices with `x & (size-1)`, which
        // presumably requires a power-of-two size such as this one - confirm where
        // the stripe size is validated.
        DefaultStripeSize = 1 << 14
)

// stripeSeries holds series by HeadSeriesRef ("ID") and also by hash of their labels.
// ID-based lookups via getByID() are preferred over getByHash() for performance reasons.
// It locks modulo ranges of IDs and hashes to reduce lock contention.
// The locks are padded to not be on the same cache line. Filling the padded space
// with the maps was profiled to be slower – likely due to the additional pointer
// dereferences.
//
// Shard indices are computed as `value & (size-1)` for both refs and hashes.
type stripeSeries struct {
        // size is the number of shards; series, hashes and locks each have this many entries.
        size                    int
        series                  []map[chunks.HeadSeriesRef]*memSeries // Sharded by ref. A series ref is the value of `size` when the series was being newly added.
        hashes                  []seriesHashmap                       // Sharded by label hash.
        locks                   []stripeLock                          // Sharded by ref for series access, by label hash for hashes access.
        // seriesLifecycleCallback receives the PreCreation, PostCreation and PostDeletion notifications.
        seriesLifecycleCallback SeriesLifecycleCallback
}

// stripeLock is the per-shard lock of stripeSeries: a sync.RWMutex followed by
// padding bytes.
type stripeLock struct {
        sync.RWMutex
        // Padding to avoid multiple locks being on the same cache line.
        _ [40]byte
}

// newStripeSeries returns a stripeSeries with stripeSize shards. Every shard
// gets an empty by-ref map and an empty unique-hash map; the conflicts maps are
// left nil and are initialized on demand in set().
func newStripeSeries(stripeSize int, seriesCallback SeriesLifecycleCallback) *stripeSeries {
        byRef := make([]map[chunks.HeadSeriesRef]*memSeries, stripeSize)
        byHash := make([]seriesHashmap, stripeSize)
        for i := 0; i < stripeSize; i++ {
                byRef[i] = make(map[chunks.HeadSeriesRef]*memSeries)
                byHash[i].unique = make(map[uint64]*memSeries)
        }

        return &stripeSeries{
                size:                    stripeSize,
                series:                  byRef,
                hashes:                  byHash,
                locks:                   make([]stripeLock, stripeSize),
                seriesLifecycleCallback: seriesCallback,
        }
}

// gc garbage collects old chunks that are strictly before mint and removes
// series entirely that have no chunks left.
// note: returning map[chunks.HeadSeriesRef]struct{} would be more accurate,
// but the returned map goes into postings.Delete() which expects a map[storage.SeriesRef]struct
// and there's no easy way to cast maps.
//
// The results are, in order:
//   - the refs of the series that were removed entirely,
//   - the labels of those removed series,
//   - the sum of the values returned by truncateChunksBefore (presumably the
//     number of chunks removed),
//   - the lowest minTime() among the series that were kept, or mint if no
//     series was kept,
//   - the lowest min time among the remaining OOO chunks (m-mapped and head),
//     or math.MaxInt64 if there are none,
//   - minMmapFile.
//
// minMmapFile is the min mmap file number seen in the series (in-order and out-of-order) after gc'ing the series.
// It stays at math.MaxInt32 if no series has any m-mapped chunk left.
func (s *stripeSeries) gc(mint int64, minOOOMmapRef chunks.ChunkDiskMapperRef) (_ map[storage.SeriesRef]struct{}, _ map[labels.Label]struct{}, _ int, _, _ int64, minMmapFile int) {
        var (
                deleted          = map[storage.SeriesRef]struct{}{}
                affected         = map[labels.Label]struct{}{}
                rmChunks         = 0
                actualMint int64 = math.MaxInt64
                minOOOTime int64 = math.MaxInt64
        )
        minMmapFile = math.MaxInt32

        // For one series, truncate old chunks and check if any chunks left. If not, mark as deleted and collect the ID.
        // check is invoked by iterForDeletion while the stripe lock of hashShard is held.
        check := func(hashShard int, hash uint64, series *memSeries, deletedForCallback map[chunks.HeadSeriesRef]labels.Labels) {
                series.Lock()
                defer series.Unlock()

                rmChunks += series.truncateChunksBefore(mint, minOOOMmapRef)

                // Track the lowest m-map file sequence still referenced, looking at
                // the first in-order and the first OOO m-mapped chunk.
                if len(series.mmappedChunks) > 0 {
                        seq, _ := series.mmappedChunks[0].ref.Unpack()
                        if seq < minMmapFile {
                                minMmapFile = seq
                        }
                }
                if series.ooo != nil && len(series.ooo.oooMmappedChunks) > 0 {
                        seq, _ := series.ooo.oooMmappedChunks[0].ref.Unpack()
                        if seq < minMmapFile {
                                minMmapFile = seq
                        }
                        for _, ch := range series.ooo.oooMmappedChunks {
                                if ch.minTime < minOOOTime {
                                        minOOOTime = ch.minTime
                                }
                        }
                }
                if series.ooo != nil && series.ooo.oooHeadChunk != nil {
                        if series.ooo.oooHeadChunk.minTime < minOOOTime {
                                minOOOTime = series.ooo.oooHeadChunk.minTime
                        }
                }
                // The series is kept if it still has any chunk (in-order or OOO) or
                // has a pending commit.
                if len(series.mmappedChunks) > 0 || series.headChunks != nil || series.pendingCommit ||
                        (series.ooo != nil && (len(series.ooo.oooMmappedChunks) > 0 || series.ooo.oooHeadChunk != nil)) {
                        seriesMint := series.minTime()
                        if seriesMint < actualMint {
                                actualMint = seriesMint
                        }
                        return
                }
                // The series is gone entirely. We need to keep the series lock
                // and make sure we have acquired the stripe locks for hash and ID of the
                // series alike.
                // If we don't hold them all, there's a very small chance that a series receives
                // samples again while we are half-way into deleting it.
                refShard := int(series.ref) & (s.size - 1)
                if hashShard != refShard {
                        s.locks[refShard].Lock()
                        defer s.locks[refShard].Unlock()
                }

                deleted[storage.SeriesRef(series.ref)] = struct{}{}
                series.lset.Range(func(l labels.Label) { affected[l] = struct{}{} })
                s.hashes[hashShard].del(hash, series.ref)
                delete(s.series[refShard], series.ref)
                deletedForCallback[series.ref] = series.lset
        }

        s.iterForDeletion(check)

        // No series was kept, so fall back to the requested mint.
        if actualMint == math.MaxInt64 {
                actualMint = mint
        }

        return deleted, affected, rmChunks, actualMint, minOOOTime, minMmapFile
}

// iterForDeletion walks all series shard by shard and invokes checkDeletedFunc
// for each of them while holding the write lock of the shard being walked.
// checkDeletedFunc receives the shard index, the series hash, the series and a
// per-shard map; it should add to that map every series it deleted that must
// be reported to the PostDeletion hook. The hook is called once per shard,
// after the shard lock has been released.
// It returns the total number of series collected in those maps.
func (s *stripeSeries) iterForDeletion(checkDeletedFunc func(int, uint64, *memSeries, map[chunks.HeadSeriesRef]labels.Labels)) int {
        var (
                total    int
                sizeHint int // Size of the previous shard's map, used to pre-size the next one.
        )
        for shard := 0; shard < s.size; shard++ {
                removed := make(map[chunks.HeadSeriesRef]labels.Labels, sizeHint)

                s.locks[shard].Lock()
                hm := &s.hashes[shard]
                // Conflicts go first, so that deleting an entry of `unique` cannot
                // move a not yet visited conflict into `unique` beforehand.
                for hash, bucket := range hm.conflicts {
                        for _, series := range bucket {
                                checkDeletedFunc(shard, hash, series, removed)
                        }
                }
                for hash, series := range hm.unique {
                        checkDeletedFunc(shard, hash, series, removed)
                }
                s.locks[shard].Unlock()

                s.seriesLifecycleCallback.PostDeletion(removed)
                sizeHint = len(removed)
                total += sizeHint
        }
        return total
}

// getByID looks up a series by its reference under the read lock of the shard
// selected by the low bits of id. It returns nil if the map has no entry for id.
func (s *stripeSeries) getByID(id chunks.HeadSeriesRef) *memSeries {
        shard := uint64(id) & uint64(s.size-1)

        s.locks[shard].RLock()
        found := s.series[shard][id]
        s.locks[shard].RUnlock()

        return found
}

// getByHash looks up a series by label-set hash and labels under the read lock
// of the shard selected by the low bits of hash.
func (s *stripeSeries) getByHash(hash uint64, lset labels.Labels) *memSeries {
        shard := hash & uint64(s.size-1)

        s.locks[shard].RLock()
        found := s.hashes[shard].get(hash, lset)
        s.locks[shard].RUnlock()

        return found
}

// getOrSet returns the series already registered for (hash, lset) or, if there is none,
// registers the one produced by createSeries.
// The boolean result is true only when a new series was registered. An error is returned
// when no series existed and the PreCreation callback rejected the creation.
func (s *stripeSeries) getOrSet(hash uint64, lset labels.Labels, createSeries func() *memSeries) (*memSeries, bool, error) {
        // PreCreation is called here to avoid calling it inside the lock.
        // It is not necessary to call it just before creating a series,
        // rather it gives a 'hint' whether to create a series or not.
        preCreationErr := s.seriesLifecycleCallback.PreCreation(lset)

        // Create the series, unless the PreCreation() callback has failed.
        // If failed, we'll not allow to create a new series anyway.
        var series *memSeries
        if preCreationErr == nil {
                series = createSeries()
        }

        // The hash map lives in the shard selected by the label hash.
        i := hash & uint64(s.size-1)
        s.locks[i].Lock()

        // An existing series always wins, even if PreCreation failed or a new
        // series object was already built above.
        if prev := s.hashes[i].get(hash, lset); prev != nil {
                s.locks[i].Unlock()
                return prev, false, nil
        }
        if preCreationErr == nil {
                s.hashes[i].set(hash, series)
        }
        s.locks[i].Unlock()

        if preCreationErr != nil {
                // The callback prevented creation of series.
                return nil, false, preCreationErr
        }
        // Setting the series in the s.hashes marks the creation of series
        // as any further calls to this methods would return that series.
        s.seriesLifecycleCallback.PostCreation(series.lset)

        // The by-ref map lives in the shard selected by the series ref, which may
        // differ from the hash shard locked above.
        i = uint64(series.ref) & uint64(s.size-1)

        s.locks[i].Lock()
        s.series[i][series.ref] = series
        s.locks[i].Unlock()

        return series, true, nil
}

// sample is a single data point that carries a timestamp and a float value,
// an integer histogram, or a float histogram.
// NOTE: newSample builds this struct positionally, so the field order matters.
type sample struct {
        t  int64                     // Timestamp.
        f  float64                   // Float value.
        h  *histogram.Histogram      // Integer histogram, may be nil.
        fh *histogram.FloatHistogram // Float histogram, may be nil.
}

// newSample wraps the given timestamp, float value and histograms into a chunks.Sample.
func newSample(t int64, v float64, h *histogram.Histogram, fh *histogram.FloatHistogram) chunks.Sample {
        return sample{
                t:  t,
                f:  v,
                h:  h,
                fh: fh,
        }
}

// T returns the timestamp of the sample.
func (s sample) T() int64 {
        return s.t
}

// F returns the float value of the sample.
func (s sample) F() float64 {
        return s.f
}

// H returns the integer histogram of the sample, which may be nil.
func (s sample) H() *histogram.Histogram {
        return s.h
}

// FH returns the float histogram of the sample, which may be nil.
func (s sample) FH() *histogram.FloatHistogram {
        return s.fh
}

// Type reports which kind of value the sample holds. An integer histogram takes
// precedence over a float histogram; if neither is set the sample is a float.
func (s sample) Type() chunkenc.ValueType {
        if s.h != nil {
                return chunkenc.ValHistogram
        }
        if s.fh != nil {
                return chunkenc.ValFloatHistogram
        }
        return chunkenc.ValFloat
}

// memSeries is the in-memory representation of a series. None of its methods
// are goroutine safe and it is the caller's responsibility to lock it.
type memSeries struct {
        sync.Mutex

        ref  chunks.HeadSeriesRef
        lset labels.Labels
        meta *metadata.Metadata

        // Series labels hash to use for sharding purposes. The value is always 0 when sharding has not
        // been explicitly enabled in TSDB.
        shardHash uint64

        // Immutable chunks on disk that have not yet gone into a block, in order of ascending time stamps.
        // When compaction runs, chunks get moved into a block and all pointers are shifted like so:
        //
        //                                    /------- let's say these 2 chunks get stored into a block
        //                                    |  |
        // before compaction: mmappedChunks=[p5,p6,p7,p8,p9] firstChunkID=5
        //  after compaction: mmappedChunks=[p7,p8,p9]       firstChunkID=7
        //
        // pN is the pointer to the mmappedChunk referred to by HeadChunkID=N
        mmappedChunks []*mmappedChunk
        // Most recent chunks in memory that are still being built or waiting to be mmapped.
        // This is a linked list, headChunks points to the most recent chunk, headChunks.prev points
        // to the next older chunk and so on.
        headChunks   *memChunk
        firstChunkID chunks.HeadChunkID // HeadChunkID for mmappedChunks[0]

        // Out-of-order state; nil when the series holds no out-of-order data.
        ooo *memSeriesOOOFields

        mmMaxTime int64 // Max time of any mmapped chunk, only used during WAL replay.

        nextAt                           int64 // Timestamp at which to cut the next chunk.
        histogramChunkHasComputedEndTime bool  // True if nextAt has been predicted for the current histograms chunk; false otherwise.

        // We keep the last value here (in addition to appending it to the chunk) so we can check for duplicates.
        lastValue float64

        // We keep the last histogram value here (in addition to appending it to the chunk) so we can check for duplicates.
        lastHistogramValue      *histogram.Histogram
        lastFloatHistogramValue *histogram.FloatHistogram

        // Current appender for the head chunk. Set when a new head chunk is cut.
        // It is nil only if headChunks is nil. E.g. if there was an appender that created a new series, but rolled back the commit
        // (the first sample would create a headChunk, hence appender, but rollback skipped it while the Append() call would create a series).
        app chunkenc.Appender

        // txs is nil if isolation is disabled.
        txs *txRing

        pendingCommit bool // Whether there are samples waiting to be committed to this series.
}

// memSeriesOOOFields contains the fields required by memSeries
// to handle out-of-order data. It is kept behind a pointer on memSeries,
// which is nil for series without out-of-order data.
type memSeriesOOOFields struct {
        oooMmappedChunks []*mmappedChunk    // Immutable chunks on disk containing OOO samples.
        oooHeadChunk     *oooHeadChunk      // Most recent chunk for ooo samples in memory that's still being built.
        firstOOOChunkID  chunks.HeadChunkID // HeadOOOChunkID for oooMmappedChunks[0].
}

// newMemSeries builds an empty memSeries for the given labels, reference and shard hash.
// nextAt starts at math.MinInt64. A transaction ring is allocated only when
// isolation is enabled; otherwise txs stays nil.
func newMemSeries(lset labels.Labels, id chunks.HeadSeriesRef, shardHash uint64, isolationDisabled bool) *memSeries {
        var txs *txRing
        if !isolationDisabled {
                txs = newTxRing(0)
        }
        return &memSeries{
                ref:       id,
                lset:      lset,
                shardHash: shardHash,
                nextAt:    math.MinInt64,
                txs:       txs,
        }
}

// minTime returns the min time of the first mmapped chunk if there is one,
// otherwise the min time of the oldest head chunk, and math.MinInt64 when the
// series has neither.
func (s *memSeries) minTime() int64 {
        switch {
        case len(s.mmappedChunks) > 0:
                return s.mmappedChunks[0].minTime
        case s.headChunks != nil:
                return s.headChunks.oldest().minTime
        default:
                return math.MinInt64
        }
}

// maxTime returns the max time of the most recent head chunk if there is one,
// otherwise the max time of the last mmapped chunk, and math.MinInt64 when the
// series has neither.
func (s *memSeries) maxTime() int64 {
        // The highest timestamps will always be in the regular (non-OOO) chunks, even if OOO is enabled.
        switch n := len(s.mmappedChunks); {
        case s.headChunks != nil:
                return s.headChunks.maxTime
        case n > 0:
                return s.mmappedChunks[n-1].maxTime
        default:
                return math.MinInt64
        }
}

// truncateChunksBefore removes all chunks from the series that
// have no timestamp at or after mint.
// Chunk IDs remain unchanged.
// Out-of-order mmapped chunks are removed based on their disk reference rather
// than on time: leading chunks whose ref is not greater than minOOOMmapRef are dropped.
// It returns the total number of in-order and out-of-order chunks removed.
func (s *memSeries) truncateChunksBefore(mint int64, minOOOMmapRef chunks.ChunkDiskMapperRef) int {
        var removedInOrder int
        if s.headChunks != nil {
                var i int
                var nextChk *memChunk // The newer neighbour of chk, i.e. the element whose prev is chk.
                chk := s.headChunks
                // Walk from the most recent head chunk towards older ones.
                for chk != nil {
                        if chk.maxTime < mint {
                                // If any head chunk is truncated, we can truncate all mmapped chunks.
                                // chk.len() counts chk itself plus every older head chunk behind it.
                                removedInOrder = chk.len() + len(s.mmappedChunks)
                                s.firstChunkID += chunks.HeadChunkID(removedInOrder)
                                if i == 0 {
                                        // This is the first chunk on the list so we need to remove the entire list.
                                        s.headChunks = nil
                                } else {
                                        // This is NOT the first chunk, unlink it from parent.
                                        nextChk.prev = nil
                                }
                                s.mmappedChunks = nil
                                break
                        }
                        nextChk = chk
                        chk = chk.prev
                        i++
                }
        }
        // Only reached with mmapped chunks left if no head chunk was truncated above.
        if len(s.mmappedChunks) > 0 {
                for i, c := range s.mmappedChunks {
                        if c.maxTime >= mint {
                                break
                        }
                        removedInOrder = i + 1
                }
                // Shift the surviving chunks to the front, reusing the backing array.
                s.mmappedChunks = append(s.mmappedChunks[:0], s.mmappedChunks[removedInOrder:]...)
                s.firstChunkID += chunks.HeadChunkID(removedInOrder)
        }

        var removedOOO int
        if s.ooo != nil && len(s.ooo.oooMmappedChunks) > 0 {
                for i, c := range s.ooo.oooMmappedChunks {
                        if c.ref.GreaterThan(minOOOMmapRef) {
                                break
                        }
                        removedOOO = i + 1
                }
                s.ooo.oooMmappedChunks = append(s.ooo.oooMmappedChunks[:0], s.ooo.oooMmappedChunks[removedOOO:]...)
                s.ooo.firstOOOChunkID += chunks.HeadChunkID(removedOOO)

                // Drop the OOO state entirely once nothing out-of-order is left.
                if len(s.ooo.oooMmappedChunks) == 0 && s.ooo.oooHeadChunk == nil {
                        s.ooo = nil
                }
        }

        return removedInOrder + removedOOO
}

// cleanupAppendIDsBelow cleans up older appendIDs. Has to be called after
// acquiring lock. It is a no-op when the series has no transaction ring.
func (s *memSeries) cleanupAppendIDsBelow(bound uint64) {
        if s.txs == nil {
                return
        }
        s.txs.cleanupAppendIDsBelow(bound)
}

// memChunk is one element of a singly linked list of in-memory chunks.
// The list is traversed via prev, from the element it is entered at towards older chunks.
type memChunk struct {
        chunk            chunkenc.Chunk
        minTime, maxTime int64
        prev             *memChunk // Link to the previous element on the list.
}

// len returns the length of memChunk list, including the element it was called on.
// A nil receiver yields 0.
func (mc *memChunk) len() (count int) {
        for cur := mc; cur != nil; cur = cur.prev {
                count++
        }
        return count
}

// oldest returns the oldest element on the list.
// For single element list this will be the same memChunk oldest() was called on.
// The receiver must not be nil.
func (mc *memChunk) oldest() (elem *memChunk) {
        elem = mc
        for older := elem.prev; older != nil; older = elem.prev {
                elem = older
        }
        return elem
}

// atOffset returns a memChunk that's Nth element on the linked list.
// Offset 0 is the receiver itself and each further step follows prev.
// It returns nil for a negative offset or when the list is shorter than offset+1 elements.
func (mc *memChunk) atOffset(offset int) (elem *memChunk) {
        if offset < 0 {
                return nil
        }

        elem = mc
        for step := 0; step < offset; step++ {
                elem = elem.prev
                if elem == nil {
                        return nil
                }
        }
        return elem
}

// oooHeadChunk pairs an OOOChunk with its time range.
type oooHeadChunk struct {
        chunk            *OOOChunk
        minTime, maxTime int64 // can probably be removed and pulled out of the chunk instead
}

// OverlapsClosedInterval returns true if the chunk overlaps [mint, maxt].
// Both ends of both ranges are inclusive.
func (mc *oooHeadChunk) OverlapsClosedInterval(mint, maxt int64) bool {
        startsBeforeEnd := mc.minTime <= maxt
        endsAfterStart := mint <= mc.maxTime
        return startsBeforeEnd && endsAfterStart
}

// OverlapsClosedInterval returns true if the chunk overlaps [mint, maxt].
// Both ends of both ranges are inclusive.
func (mc *memChunk) OverlapsClosedInterval(mint, maxt int64) bool {
        startsBeforeEnd := mc.minTime <= maxt
        endsAfterStart := mint <= mc.maxTime
        return startsBeforeEnd && endsAfterStart
}

// overlapsClosedInterval reports whether the closed intervals [mint1, maxt1]
// and [mint2, maxt2] share at least one point.
func overlapsClosedInterval(mint1, maxt1, mint2, maxt2 int64) bool {
        // Two closed intervals are disjoint exactly when one starts after the other ends.
        disjoint := mint1 > maxt2 || mint2 > maxt1
        return !disjoint
}

// mmappedChunk describes a head chunk on disk that has been mmapped.
type mmappedChunk struct {
        ref              chunks.ChunkDiskMapperRef // Reference to the chunk in the chunk disk mapper.
        numSamples       uint16
        minTime, maxTime int64
}

// OverlapsClosedInterval returns true if the chunk overlaps [mint, maxt].
// Both ends of both ranges are inclusive.
func (mc *mmappedChunk) OverlapsClosedInterval(mint, maxt int64) bool {
        startsBeforeEnd := mc.minTime <= maxt
        endsAfterStart := mint <= mc.maxTime
        return startsBeforeEnd && endsAfterStart
}

// noopSeriesLifecycleCallback is a series lifecycle callback whose hooks do nothing.
type noopSeriesLifecycleCallback struct{}

// PreCreation always allows the series to be created.
func (noopSeriesLifecycleCallback) PreCreation(labels.Labels) error {
        return nil
}

// PostCreation does nothing.
func (noopSeriesLifecycleCallback) PostCreation(labels.Labels) {
}

// PostDeletion does nothing.
func (noopSeriesLifecycleCallback) PostDeletion(map[chunks.HeadSeriesRef]labels.Labels) {
}

// Size returns the combined size reported by the WAL, the WBL and the chunk disk mapper.
// A nil WAL or WBL contributes nothing. Errors from the individual Size calls
// are deliberately ignored, so the result is a best-effort figure.
func (h *Head) Size() int64 {
        var total int64
        if h.wal != nil {
                sz, _ := h.wal.Size()
                total += sz
        }
        if h.wbl != nil {
                sz, _ := h.wbl.Size()
                total += sz
        }
        sz, _ := h.chunkDiskMapper.Size()
        return total + sz
}

// Size returns the size of the underlying Head; it is not restricted to the
// range covered by this RangeHead.
func (h *RangeHead) Size() int64 {
        return h.head.Size()
}

// startWALReplayStatus records the bounds of a WAL replay in the head stats:
// Min and Current are set to startFrom, Max is set to last.
// The update happens under the WALReplayStatus lock.
func (h *Head) startWALReplayStatus(startFrom, last int) {
        h.stats.WALReplayStatus.Lock()
        defer h.stats.WALReplayStatus.Unlock()

        h.stats.WALReplayStatus.Current = startFrom
        h.stats.WALReplayStatus.Min = startFrom
        h.stats.WALReplayStatus.Max = last
}

// updateWALReplayStatusRead stores current as the WAL replay progress in the
// head stats, under the WALReplayStatus lock.
func (h *Head) updateWALReplayStatusRead(current int) {
        h.stats.WALReplayStatus.Lock()
        h.stats.WALReplayStatus.Current = current
        h.stats.WALReplayStatus.Unlock()
}

// Copyright 2021 The Prometheus Authors
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package tsdb

import (
        "context"
        "errors"
        "fmt"
        "math"

        "github.com/go-kit/log/level"

        "github.com/prometheus/prometheus/model/exemplar"
        "github.com/prometheus/prometheus/model/histogram"
        "github.com/prometheus/prometheus/model/labels"
        "github.com/prometheus/prometheus/model/metadata"
        "github.com/prometheus/prometheus/model/value"
        "github.com/prometheus/prometheus/storage"
        "github.com/prometheus/prometheus/tsdb/chunkenc"
        "github.com/prometheus/prometheus/tsdb/chunks"
        "github.com/prometheus/prometheus/tsdb/record"
)

// initAppender is a helper to initialize the time bounds of the head
// upon the first sample it receives.
// The wrapped appender is created lazily on the first write and every later
// call is forwarded to it.
type initAppender struct {
        app  storage.Appender // Nil until the first write goes through this appender.
        head *Head
}

// Compile-time check that initAppender implements storage.GetRef.
var _ storage.GetRef = &initAppender{}

// Append forwards the float sample to the underlying head appender. On the
// first write it initializes the head time bounds from t and creates that appender.
func (a *initAppender) Append(ref storage.SeriesRef, lset labels.Labels, t int64, v float64) (storage.SeriesRef, error) {
        if a.app == nil {
                a.head.initTime(t)
                a.app = a.head.appender()
        }
        return a.app.Append(ref, lset, t, v)
}

// AppendExemplar forwards the exemplar to the underlying head appender.
// It is a no-op returning (0, nil) when exemplar storage is disabled or
// MaxExemplars is not positive.
func (a *initAppender) AppendExemplar(ref storage.SeriesRef, l labels.Labels, e exemplar.Exemplar) (storage.SeriesRef, error) {
        // Check if exemplar storage is enabled.
        if enabled := a.head.opts.EnableExemplarStorage && a.head.opts.MaxExemplars.Load() > 0; !enabled {
                return 0, nil
        }

        if a.app == nil {
                // We should never reach here given we would call Append before AppendExemplar
                // and we probably want to always base head/WAL min time on sample times.
                a.head.initTime(e.Ts)
                a.app = a.head.appender()
        }
        return a.app.AppendExemplar(ref, l, e)
}

// AppendHistogram forwards the histogram sample to the underlying head appender. On the
// first write it initializes the head time bounds from t and creates that appender.
func (a *initAppender) AppendHistogram(ref storage.SeriesRef, l labels.Labels, t int64, h *histogram.Histogram, fh *histogram.FloatHistogram) (storage.SeriesRef, error) {
        if a.app == nil {
                a.head.initTime(t)
                a.app = a.head.appender()
        }
        return a.app.AppendHistogram(ref, l, t, h, fh)
}

// UpdateMetadata forwards the metadata update to the underlying head appender,
// creating it first if needed. Metadata carries no timestamp, so unlike the
// sample methods this one does not initialize the head time bounds.
func (a *initAppender) UpdateMetadata(ref storage.SeriesRef, l labels.Labels, m metadata.Metadata) (storage.SeriesRef, error) {
        if a.app == nil {
                a.app = a.head.appender()
        }
        return a.app.UpdateMetadata(ref, l, m)
}

// AppendCTZeroSample forwards the created-timestamp zero sample to the underlying
// head appender. On the first write it initializes the head time bounds from t
// (the sample timestamp, not ct) and creates that appender.
func (a *initAppender) AppendCTZeroSample(ref storage.SeriesRef, lset labels.Labels, t, ct int64) (storage.SeriesRef, error) {
        if a.app == nil {
                a.head.initTime(t)
                a.app = a.head.appender()
        }
        return a.app.AppendCTZeroSample(ref, lset, t, ct)
}

// initTime initializes a head with the first timestamp. This only needs to be called
// for a completely fresh head with an empty WAL.
// It does nothing when the min time has already been moved away from math.MaxInt64.
func (h *Head) initTime(t int64) {
        if h.minTime.CompareAndSwap(math.MaxInt64, t) {
                // Ensure that max time is initialized to at least the min time we just set.
                // Concurrent appenders may already have set it to a higher value.
                h.maxTime.CompareAndSwap(math.MinInt64, t)
        }
}

// GetRef delegates to the underlying appender when it implements storage.GetRef.
// Otherwise, including when no appender has been created yet, it returns a
// zero reference and empty labels.
func (a *initAppender) GetRef(lset labels.Labels, hash uint64) (storage.SeriesRef, labels.Labels) {
        getter, ok := a.app.(storage.GetRef)
        if !ok {
                return 0, labels.EmptyLabels()
        }
        return getter.GetRef(lset, hash)
}

// Commit commits the underlying appender. If nothing was ever written through
// this appender there is nothing to commit, so only the active appenders
// metric is decremented.
func (a *initAppender) Commit() error {
        if a.app != nil {
                return a.app.Commit()
        }
        a.head.metrics.activeAppenders.Dec()
        return nil
}

// Rollback rolls back the underlying appender. If nothing was ever written
// through this appender there is nothing to roll back, so only the active
// appenders metric is decremented.
func (a *initAppender) Rollback() error {
        if a.app != nil {
                return a.app.Rollback()
        }
        a.head.metrics.activeAppenders.Dec()
        return nil
}

// Appender returns a new Appender on the database.
// The active appenders metric is incremented for every call.
func (h *Head) Appender(_ context.Context) storage.Appender {
        h.metrics.activeAppenders.Inc()

        if h.initialized() {
                return h.appender()
        }
        // The head cache might not have a starting point yet. The init appender
        // picks up the first appended timestamp as the base.
        return &initAppender{head: h}
}

// appender builds a headAppender bound to the current minimum valid time and a
// fresh isolation append ID, with its working buffers taken from the head's pools.
func (h *Head) appender() *headAppender {
        minValidTime := h.appendableMinValidTime()
        // Every appender gets an ID that is cleared upon commit/rollback.
        appendID, cleanupAppendIDsBelow := h.iso.newAppendID(minValidTime)

        app := &headAppender{
                head:                  h,
                minValidTime:          minValidTime,
                mint:                  math.MaxInt64,
                maxt:                  math.MinInt64,
                headMaxt:              h.MaxTime(),
                oooTimeWindow:         h.opts.OutOfOrderTimeWindow.Load(),
                samples:               h.getAppendBuffer(),
                sampleSeries:          h.getSeriesBuffer(),
                histograms:            h.getHistogramBuffer(),
                floatHistograms:       h.getFloatHistogramBuffer(),
                metadata:              h.getMetadataBuffer(),
                appendID:              appendID,
                cleanupAppendIDsBelow: cleanupAppendIDsBelow,
        }
        // Allocate the exemplars buffer only if exemplars are enabled.
        if h.opts.EnableExemplarStorage {
                app.exemplars = h.getExemplarBuffer()
        }
        return app
}

// appendableMinValidTime returns the minimum valid timestamp for appends,
// such that samples stay ahead of prior blocks and the head compaction window.
// It is the larger of two boundaries, described below.
func (h *Head) appendableMinValidTime() int64 {
        // First boundary: the end of the compaction window. It ensures that no samples
        // will be added to the compaction window, which allows race-free, concurrent
        // appending and compaction.
        compactionWindowEnd := h.MaxTime() - h.chunkRange.Load()/2

        // Second boundary: the stored min valid time. It ensures that we avoid
        // overlapping timeframes from one block to the next. While not necessary for
        // correctness, it means we're not required to use vertical compaction.
        return max(compactionWindowEnd, h.minValidTime.Load())
}

// AppendableMinValidTime returns the minimum valid time for samples to be appended to the Head.
// Returns false if Head hasn't been initialized yet and the minimum time isn't known yet.
func (h *Head) AppendableMinValidTime() (int64, bool) {
        if h.initialized() {
                return h.appendableMinValidTime(), true
        }
        return 0, false
}

// getAppendBuffer returns a float sample buffer from the pool, or a fresh one
// with capacity 512 when the pool yields nil.
func (h *Head) getAppendBuffer() []record.RefSample {
        if pooled := h.appendPool.Get(); pooled != nil {
                return pooled
        }
        return make([]record.RefSample, 0, 512)
}

// putAppendBuffer hands the float sample buffer back to the pool, truncated
// to zero length so its capacity can be reused.
func (h *Head) putAppendBuffer(b []record.RefSample) {
        emptied := b[:0]
        h.appendPool.Put(emptied)
}

// getExemplarBuffer returns an exemplar buffer from the pool, or a fresh one
// with capacity 512 when the pool yields nil.
func (h *Head) getExemplarBuffer() []exemplarWithSeriesRef {
        if pooled := h.exemplarsPool.Get(); pooled != nil {
                return pooled
        }
        return make([]exemplarWithSeriesRef, 0, 512)
}

// putExemplarBuffer hands the exemplar buffer back to the pool, truncated to
// zero length. A nil buffer is ignored and never pooled.
func (h *Head) putExemplarBuffer(b []exemplarWithSeriesRef) {
        if b == nil {
                return
        }
        // Zero out to avoid retaining label data.
        for idx := 0; idx < len(b); idx++ {
                b[idx].exemplar.Labels = labels.EmptyLabels()
        }
        emptied := b[:0]
        h.exemplarsPool.Put(emptied)
}

// getHistogramBuffer returns a histogram sample buffer from the pool, or a
// fresh one with capacity 512 when the pool yields nil.
func (h *Head) getHistogramBuffer() []record.RefHistogramSample {
        if pooled := h.histogramsPool.Get(); pooled != nil {
                return pooled
        }
        return make([]record.RefHistogramSample, 0, 512)
}

// putHistogramBuffer hands the histogram sample buffer back to the pool,
// truncated to zero length so its capacity can be reused.
func (h *Head) putHistogramBuffer(b []record.RefHistogramSample) {
        emptied := b[:0]
        h.histogramsPool.Put(emptied)
}

// getFloatHistogramBuffer returns a float histogram sample buffer from the
// pool, or a fresh one with capacity 512 when the pool yields nil.
func (h *Head) getFloatHistogramBuffer() []record.RefFloatHistogramSample {
        if pooled := h.floatHistogramsPool.Get(); pooled != nil {
                return pooled
        }
        return make([]record.RefFloatHistogramSample, 0, 512)
}

// putFloatHistogramBuffer hands the float histogram sample buffer back to the
// pool, truncated to zero length so its capacity can be reused.
func (h *Head) putFloatHistogramBuffer(b []record.RefFloatHistogramSample) {
        emptied := b[:0]
        h.floatHistogramsPool.Put(emptied)
}

// getMetadataBuffer returns a metadata buffer from the pool, or a fresh one
// with capacity 512 when the pool yields nil.
func (h *Head) getMetadataBuffer() []record.RefMetadata {
        if pooled := h.metadataPool.Get(); pooled != nil {
                return pooled
        }
        return make([]record.RefMetadata, 0, 512)
}

// putMetadataBuffer hands the metadata buffer back to the pool, truncated to
// zero length so its capacity can be reused.
func (h *Head) putMetadataBuffer(b []record.RefMetadata) {
        emptied := b[:0]
        h.metadataPool.Put(emptied)
}

// getSeriesBuffer returns a series pointer buffer from the pool, or a fresh
// one with capacity 512 when the pool yields nil.
func (h *Head) getSeriesBuffer() []*memSeries {
        if pooled := h.seriesPool.Get(); pooled != nil {
                return pooled
        }
        return make([]*memSeries, 0, 512)
}

// putSeriesBuffer hands the series pointer buffer back to the pool, truncated
// to zero length, after nil-ing every element.
func (h *Head) putSeriesBuffer(b []*memSeries) {
        // Zero out to avoid retaining data.
        for idx := 0; idx < len(b); idx++ {
                b[idx] = nil
        }
        emptied := b[:0]
        h.seriesPool.Put(emptied)
}

// getBytesBuffer returns a byte buffer from the pool, or a fresh one with
// capacity 1024 when the pool yields nil.
func (h *Head) getBytesBuffer() []byte {
        if pooled := h.bytesPool.Get(); pooled != nil {
                return pooled
        }
        return make([]byte, 0, 1024)
}

// putBytesBuffer hands the byte buffer back to the pool, truncated to zero
// length so its capacity can be reused.
func (h *Head) putBytesBuffer(b []byte) {
        emptied := b[:0]
        h.bytesPool.Put(emptied)
}

// exemplarWithSeriesRef pairs an exemplar with the reference of the series it belongs to.
type exemplarWithSeriesRef struct {
        ref      storage.SeriesRef
        exemplar exemplar.Exemplar
}

// headAppender buffers the series, samples, histograms, metadata and exemplars
// added through it. The sample and series slices are parallel: the entry at
// index i of a *Series slice is the series of the sample at index i.
type headAppender struct {
        head          *Head
        minValidTime  int64 // No samples below this timestamp are allowed.
        mint, maxt    int64 // Min and max timestamps seen by this appender so far.
        headMaxt      int64 // We track it here to not take the lock for every sample appended.
        oooTimeWindow int64 // Use the same for the entire append, and don't load the atomic for each sample.

        series               []record.RefSeries               // New series held by this appender.
        samples              []record.RefSample               // New float samples held by this appender.
        sampleSeries         []*memSeries                     // Float series corresponding to the samples held by this appender (using corresponding slice indices - same series may appear more than once).
        histograms           []record.RefHistogramSample      // New histogram samples held by this appender.
        histogramSeries      []*memSeries                     // HistogramSamples series corresponding to the samples held by this appender (using corresponding slice indices - same series may appear more than once).
        floatHistograms      []record.RefFloatHistogramSample // New float histogram samples held by this appender.
        floatHistogramSeries []*memSeries                     // FloatHistogramSamples series corresponding to the samples held by this appender (using corresponding slice indices - same series may appear more than once).
        metadata             []record.RefMetadata             // New metadata held by this appender.
        metadataSeries       []*memSeries                     // Series corresponding to the metadata held by this appender.
        exemplars            []exemplarWithSeriesRef          // New exemplars held by this appender.

        appendID, cleanupAppendIDsBelow uint64 // Values obtained from the head's isolation when the appender was created.
        closed                          bool
}

// Append buffers a float sample for the series identified by ref, falling back
// to looking up or creating the series from lset when ref is unknown.
// A stale NaN for a series whose last value was a histogram is redirected to
// AppendHistogram. On success the sample is only added to the appender's
// buffers and the series reference is returned; on failure the returned
// reference is 0.
func (a *headAppender) Append(ref storage.SeriesRef, lset labels.Labels, t int64, v float64) (storage.SeriesRef, error) {
        // For OOO inserts, this restriction is irrelevant and will be checked later once we confirm the sample is an in-order append.
        // If OOO inserts are disabled, we may as well as check this as early as we can and avoid more work.
        if a.oooTimeWindow == 0 && t < a.minValidTime {
                a.head.metrics.outOfBoundSamples.WithLabelValues(sampleMetricTypeFloat).Inc()
                return 0, storage.ErrOutOfBounds
        }

        s := a.head.series.getByID(chunks.HeadSeriesRef(ref))
        if s == nil {
                var err error
                s, err = a.getOrCreate(lset)
                if err != nil {
                        return 0, err
                }
        }

        // A staleness marker for a histogram series is appended as a histogram
        // whose Sum carries the stale NaN.
        // NOTE(review): the last*Value fields are read here without holding the series lock.
        if value.IsStaleNaN(v) {
                switch {
                case s.lastHistogramValue != nil:
                        return a.AppendHistogram(ref, lset, t, &histogram.Histogram{Sum: v}, nil)
                case s.lastFloatHistogramValue != nil:
                        return a.AppendHistogram(ref, lset, t, nil, &histogram.FloatHistogram{Sum: v})
                }
        }

        s.Lock()
        // TODO(codesome): If we definitely know at this point that the sample is ooo, then optimise
        // to skip that sample from the WAL and write only in the WBL.
        _, delta, err := s.appendable(t, v, a.headMaxt, a.minValidTime, a.oooTimeWindow)
        if err == nil {
                s.pendingCommit = true
        }
        s.Unlock()
        // delta is observed divided by 1000, also when the sample ends up being rejected.
        if delta > 0 {
                a.head.metrics.oooHistogram.Observe(float64(delta) / 1000)
        }
        if err != nil {
                // Only these two error kinds are counted here; other errors are returned as is.
                switch {
                case errors.Is(err, storage.ErrOutOfOrderSample):
                        a.head.metrics.outOfOrderSamples.WithLabelValues(sampleMetricTypeFloat).Inc()
                case errors.Is(err, storage.ErrTooOldSample):
                        a.head.metrics.tooOldSamples.WithLabelValues(sampleMetricTypeFloat).Inc()
                }
                return 0, err
        }

        // Track the time range covered by this appender.
        if t < a.mint {
                a.mint = t
        }
        if t > a.maxt {
                a.maxt = t
        }

        // samples and sampleSeries are kept in lockstep: same index, same sample.
        a.samples = append(a.samples, record.RefSample{
                Ref: s.ref,
                T:   t,
                V:   v,
        })
        a.sampleSeries = append(a.sampleSeries, s)
        return storage.SeriesRef(s.ref), nil
}

// AppendCTZeroSample appends synthetic zero sample for ct timestamp. It returns
// error when sample can't be appended. See
// storage.CreatedTimestampAppender.AppendCTZeroSample for further documentation.
//
// ct must be strictly older than t. The series is looked up by ref and, when
// that fails, looked up or created from lset. A zero sample that would be
// out of order for the series is rejected with storage.ErrOutOfOrderCT; in that
// case the series reference is still returned alongside the error.
func (a *headAppender) AppendCTZeroSample(ref storage.SeriesRef, lset labels.Labels, t, ct int64) (storage.SeriesRef, error) {
        if ct >= t {
                return 0, fmt.Errorf("CT is newer or the same as sample's timestamp, ignoring")
        }

        s := a.head.series.getByID(chunks.HeadSeriesRef(ref))
        if s == nil {
                var err error
                s, err = a.getOrCreate(lset)
                if err != nil {
                        return 0, err
                }
        }

        // Check if CT wouldn't be OOO vs samples we already might have for this series.
        // NOTE(bwplotka): This will be often hit as it's expected for long living
        // counters to share the same CT.
        s.Lock()
        isOOO, _, err := s.appendable(ct, 0, a.headMaxt, a.minValidTime, a.oooTimeWindow)
        // Only flag the series as having pending data when the zero sample is
        // actually buffered below. An OOO zero sample is rejected and the series is
        // not added to sampleSeries, so flagging it here would leave pendingCommit
        // set with nothing buffered for it.
        if err == nil && !isOOO {
                s.pendingCommit = true
        }
        s.Unlock()
        if err != nil {
                return 0, err
        }
        if isOOO {
                return storage.SeriesRef(s.ref), storage.ErrOutOfOrderCT
        }

        if ct > a.maxt {
                a.maxt = ct
        }
        a.samples = append(a.samples, record.RefSample{Ref: s.ref, T: ct, V: 0.0})
        a.sampleSeries = append(a.sampleSeries, s)
        return storage.SeriesRef(s.ref), nil
}

// getOrCreate validates lset and returns the head series for it, creating the
// series if needed. Empty labels are stripped first; an empty label set or a
// duplicate label name is rejected with an error wrapping ErrInvalidSample.
// A newly created series is also recorded in the appender's series list.
func (a *headAppender) getOrCreate(lset labels.Labels) (*memSeries, error) {
        // Ensure no empty labels have gotten through.
        lset = lset.WithoutEmpty()
        if lset.IsEmpty() {
                return nil, fmt.Errorf("empty labelset: %w", ErrInvalidSample)
        }
        if name, duplicated := lset.HasDuplicateLabelNames(); duplicated {
                return nil, fmt.Errorf(`label name "%s" is not unique: %w`, name, ErrInvalidSample)
        }

        ms, isNew, err := a.head.getOrCreate(lset.Hash(), lset)
        if err != nil {
                return nil, err
        }
        if !isNew {
                return ms, nil
        }
        a.series = append(a.series, record.RefSeries{Ref: ms.ref, Labels: lset})
        return ms, nil
}

// appendable checks whether the given sample is valid for appending to the series.
// With a nil error, isOOO == false means the sample can go in the in-order chunk
// and isOOO == true means it belongs to the out of order chunk.
// A non-nil error signifies the sample cannot be handled.
// oooDelta is headMaxt - t whenever the in-order path did not accept the sample,
// and 0 otherwise.
func (s *memSeries) appendable(t int64, v float64, headMaxt, minValidTime, oooTimeWindow int64) (isOOO bool, oooDelta int64, err error) {
        // Check if we can append in the in-order chunk.
        if t >= minValidTime {
                if s.headChunks == nil {
                        // The series has no sample and was freshly created.
                        return false, 0, nil
                }
                msMaxt := s.maxTime()
                if t > msMaxt {
                        return false, 0, nil
                }
                if t == msMaxt {
                        // We are allowing exact duplicates as we can encounter them in valid cases
                        // like federation and erroring out at that time would be extremely noisy.
                        // This only checks against the latest in-order sample.
                        // The OOO headchunk has its own method to detect these duplicates.
                        // Values are compared by bit pattern rather than with ==.
                        if math.Float64bits(s.lastValue) != math.Float64bits(v) {
                                return false, 0, storage.NewDuplicateFloatErr(t, s.lastValue, v)
                        }
                        // Sample is identical (ts + value) with most current (highest ts) sample in sampleBuf.
                        return false, 0, nil
                }
                // t < msMaxt: fall through to the out-of-order checks below.
        }

        // The sample cannot go in the in-order chunk. Check if it can go in the out-of-order chunk.
        if oooTimeWindow > 0 && t >= headMaxt-oooTimeWindow {
                return true, headMaxt - t, nil
        }

        // The sample cannot go in both in-order and out-of-order chunk.
        if oooTimeWindow > 0 {
                // OOO is enabled but the sample is older than the window allows.
                return true, headMaxt - t, storage.ErrTooOldSample
        }
        if t < minValidTime {
                return false, headMaxt - t, storage.ErrOutOfBounds
        }
        return false, headMaxt - t, storage.ErrOutOfOrderSample
}

// appendableHistogram reports whether the integer histogram h at timestamp t
// may be appended to the series. It returns nil when the series has no head
// chunk, when t is newer than the head chunk's maxTime, or when t equals it
// and h equals the last stored histogram. Older timestamps yield
// storage.ErrOutOfOrderSample; a differing histogram at the same timestamp
// yields storage.ErrDuplicateSampleForTimestamp.
func (s *memSeries) appendableHistogram(t int64, h *histogram.Histogram) error {
        head := s.headChunks
        switch {
        case head == nil || t > head.maxTime:
                return nil
        case t < head.maxTime:
                return storage.ErrOutOfOrderSample
        case h.Equals(s.lastHistogramValue):
                // Exact duplicates are accepted: they occur in valid setups such as
                // federation, and rejecting them would be extremely noisy.
                return nil
        default:
                return storage.ErrDuplicateSampleForTimestamp
        }
}

// appendableFloatHistogram reports whether the float histogram fh at
// timestamp t may be appended to the series. It returns nil when the series
// has no head chunk, when t is newer than the head chunk's maxTime, or when t
// equals it and fh equals the last stored float histogram. Older timestamps
// yield storage.ErrOutOfOrderSample; a differing histogram at the same
// timestamp yields storage.ErrDuplicateSampleForTimestamp.
func (s *memSeries) appendableFloatHistogram(t int64, fh *histogram.FloatHistogram) error {
        head := s.headChunks
        switch {
        case head == nil || t > head.maxTime:
                return nil
        case t < head.maxTime:
                return storage.ErrOutOfOrderSample
        case fh.Equals(s.lastFloatHistogramValue):
                // Exact duplicates are accepted: they occur in valid setups such as
                // federation, and rejecting them would be extremely noisy.
                return nil
        default:
                return storage.ErrDuplicateSampleForTimestamp
        }
}

// AppendExemplar queues the exemplar e for the series identified by ref (or,
// failing that, by lset). It expects the series to exist already: it neither
// calls getOrCreate nor performs the label-set validity checks Append does.
//
// When exemplar storage is disabled it returns (0, nil). Duplicate exemplars
// and the "exemplars disabled" validation error are swallowed and also
// reported as (0, nil); any other validation error is returned as is.
func (a *headAppender) AppendExemplar(ref storage.SeriesRef, lset labels.Labels, e exemplar.Exemplar) (storage.SeriesRef, error) {
        // Nothing to do unless exemplar storage is switched on and has capacity.
        if !a.head.opts.EnableExemplarStorage || a.head.opts.MaxExemplars.Load() <= 0 {
                return 0, nil
        }

        // Resolve the series: by reference first, by label hash as a fallback.
        series := a.head.series.getByID(chunks.HeadSeriesRef(ref))
        if series == nil {
                if series = a.head.series.getByHash(lset.Hash(), lset); series == nil {
                        return 0, fmt.Errorf("unknown HeadSeriesRef when trying to add exemplar: %d", ref)
                }
                ref = storage.SeriesRef(series.ref)
        }

        // Strip empty labels before validation.
        e.Labels = e.Labels.WithoutEmpty()

        switch err := a.head.exemplars.ValidateExemplar(series.lset, e); {
        case err == nil:
                // Valid exemplar, queue it below.
        case errors.Is(err, storage.ErrDuplicateExemplar) || errors.Is(err, storage.ErrExemplarsDisabled):
                // Not an error for the caller, but the exemplar is not accepted either.
                return 0, nil
        default:
                return 0, err
        }

        a.exemplars = append(a.exemplars, exemplarWithSeriesRef{ref: ref, exemplar: e})
        return storage.SeriesRef(series.ref), nil
}

// AppendHistogram queues a native histogram sample (h) or a float histogram
// sample (fh) at timestamp t for the series identified by ref, creating the
// series from lset when ref does not resolve.
//
// It returns storage.ErrNativeHistogramsDisabled when native histograms are
// off, storage.ErrOutOfBounds when t is below the appender's minValidTime,
// the validation error of h/fh, an ErrInvalidSample-wrapped error for empty
// or duplicate label names, or the error from the per-series appendable
// check. If both h and fh are set, h takes precedence.
func (a *headAppender) AppendHistogram(ref storage.SeriesRef, lset labels.Labels, t int64, h *histogram.Histogram, fh *histogram.FloatHistogram) (storage.SeriesRef, error) {
        if !a.head.opts.EnableNativeHistograms.Load() {
                return 0, storage.ErrNativeHistogramsDisabled
        }
        if t < a.minValidTime {
                a.head.metrics.outOfBoundSamples.WithLabelValues(sampleMetricTypeHistogram).Inc()
                return 0, storage.ErrOutOfBounds
        }

        // Reject malformed histograms before any series is looked up or created.
        if h != nil {
                if verr := h.Validate(); verr != nil {
                        return 0, verr
                }
        }
        if fh != nil {
                if verr := fh.Validate(); verr != nil {
                        return 0, verr
                }
        }

        series := a.head.series.getByID(chunks.HeadSeriesRef(ref))
        if series == nil {
                // Unknown reference: sanitise the label set and get or create the series.
                lset = lset.WithoutEmpty()
                if lset.IsEmpty() {
                        return 0, fmt.Errorf("empty labelset: %w", ErrInvalidSample)
                }
                if name, dup := lset.HasDuplicateLabelNames(); dup {
                        return 0, fmt.Errorf(`label name "%s" is not unique: %w`, name, ErrInvalidSample)
                }

                resolved, created, err := a.head.getOrCreate(lset.Hash(), lset)
                if err != nil {
                        return 0, err
                }
                series = resolved
                if created {
                        // Seed the matching "last value" field on the new series.
                        if h != nil {
                                series.lastHistogramValue = &histogram.Histogram{}
                        } else if fh != nil {
                                series.lastFloatHistogramValue = &histogram.FloatHistogram{}
                        }
                        a.series = append(a.series, record.RefSeries{
                                Ref:    series.ref,
                                Labels: lset,
                        })
                }
        }

        if h != nil {
                series.Lock()
                err := series.appendableHistogram(t, h)
                if err == nil {
                        series.pendingCommit = true
                }
                series.Unlock()

                if err != nil {
                        if errors.Is(err, storage.ErrOutOfOrderSample) {
                                a.head.metrics.outOfOrderSamples.WithLabelValues(sampleMetricTypeHistogram).Inc()
                        }
                        return 0, err
                }
                a.histograms = append(a.histograms, record.RefHistogramSample{
                        Ref: series.ref,
                        T:   t,
                        H:   h,
                })
                a.histogramSeries = append(a.histogramSeries, series)
        } else if fh != nil {
                series.Lock()
                err := series.appendableFloatHistogram(t, fh)
                if err == nil {
                        series.pendingCommit = true
                }
                series.Unlock()

                if err != nil {
                        if errors.Is(err, storage.ErrOutOfOrderSample) {
                                a.head.metrics.outOfOrderSamples.WithLabelValues(sampleMetricTypeHistogram).Inc()
                        }
                        return 0, err
                }
                a.floatHistograms = append(a.floatHistograms, record.RefFloatHistogramSample{
                        Ref: series.ref,
                        T:   t,
                        FH:  fh,
                })
                a.floatHistogramSeries = append(a.floatHistogramSeries, series)
        }

        // Widen the appender's time range to include this sample.
        if t < a.mint {
                a.mint = t
        }
        if t > a.maxt {
                a.maxt = t
        }

        return storage.SeriesRef(series.ref), nil
}

// UpdateMetadata queues a metadata update for an existing series. The series
// is resolved by ref first and by lset as a fallback; it is never created
// here and none of Append's label-set sanity checks are applied. The update
// is queued only if meta differs from the metadata currently stored on the
// series. The (possibly re-resolved) series reference is returned.
func (a *headAppender) UpdateMetadata(ref storage.SeriesRef, lset labels.Labels, meta metadata.Metadata) (storage.SeriesRef, error) {
        series := a.head.series.getByID(chunks.HeadSeriesRef(ref))
        if series == nil {
                if series = a.head.series.getByHash(lset.Hash(), lset); series == nil {
                        return 0, fmt.Errorf("unknown series when trying to add metadata with HeadSeriesRef: %d and labels: %s", ref, lset)
                }
                ref = storage.SeriesRef(series.ref)
        }

        // Compare under the series lock; the queueing below does not need it.
        series.Lock()
        changed := series.meta == nil || *series.meta != meta
        series.Unlock()

        if !changed {
                return ref, nil
        }

        a.metadata = append(a.metadata, record.RefMetadata{
                Ref:  series.ref,
                Type: record.GetMetricType(meta.Type),
                Unit: meta.Unit,
                Help: meta.Help,
        })
        a.metadataSeries = append(a.metadataSeries, series)
        return ref, nil
}

// Compile-time check that *headAppender implements storage.GetRef.
var _ storage.GetRef = (*headAppender)(nil)

// GetRef looks up the series matching lset (with precomputed hash) and
// returns its reference together with the series' own label set. If no such
// series exists it returns 0 and an empty label set.
func (a *headAppender) GetRef(lset labels.Labels, hash uint64) (storage.SeriesRef, labels.Labels) {
        if series := a.head.series.getByHash(hash, lset); series != nil {
                // The returned labels must be suitable to pass to Append().
                return storage.SeriesRef(series.ref), series.lset
        }
        return 0, labels.EmptyLabels()
}

// log writes all headAppender's data to the WAL.
// It is a no-op when the head has no WAL. Records are written in a fixed
// order: series, metadata, float samples, histograms, float histograms and
// finally exemplars. The first failing write aborts and is returned wrapped.
func (a *headAppender) log() error {
        if a.head.wal == nil {
                return nil
        }

        scratch := a.head.getBytesBuffer()
        // Wrapped in a closure so that the value of scratch at return time is handed back.
        defer func() { a.head.putBytesBuffer(scratch) }()

        var enc record.Encoder

        // write logs one encoded record and keeps its backing array as the
        // scratch buffer for the next encode.
        write := func(rec []byte) error {
                scratch = rec[:0]
                return a.head.wal.Log(rec)
        }

        if len(a.series) > 0 {
                if err := write(enc.Series(a.series, scratch)); err != nil {
                        return fmt.Errorf("log series: %w", err)
                }
        }
        if len(a.metadata) > 0 {
                if err := write(enc.Metadata(a.metadata, scratch)); err != nil {
                        return fmt.Errorf("log metadata: %w", err)
                }
        }
        if len(a.samples) > 0 {
                if err := write(enc.Samples(a.samples, scratch)); err != nil {
                        return fmt.Errorf("log samples: %w", err)
                }
        }
        if len(a.histograms) > 0 {
                if err := write(enc.HistogramSamples(a.histograms, scratch)); err != nil {
                        return fmt.Errorf("log histograms: %w", err)
                }
        }
        if len(a.floatHistograms) > 0 {
                if err := write(enc.FloatHistogramSamples(a.floatHistograms, scratch)); err != nil {
                        return fmt.Errorf("log float histograms: %w", err)
                }
        }
        // Exemplars go last, after every kind of sample. Otherwise remote write
        // could ship an exemplar before the samples that create its series, and
        // the exemplar would be rejected for referring to a missing series.
        if len(a.exemplars) > 0 {
                if err := write(enc.Exemplars(exemplarsForEncoding(a.exemplars), scratch)); err != nil {
                        return fmt.Errorf("log exemplars: %w", err)
                }
        }
        return nil
}

// exemplarsForEncoding converts the appender's queued exemplars into the
// record.RefExemplar form expected by the record encoder, preserving order.
func exemplarsForEncoding(es []exemplarWithSeriesRef) []record.RefExemplar {
        out := make([]record.RefExemplar, len(es))
        for i := range es {
                src := &es[i]
                out[i] = record.RefExemplar{
                        Ref:    chunks.HeadSeriesRef(src.ref),
                        T:      src.exemplar.Ts,
                        V:      src.exemplar.Value,
                        Labels: src.exemplar.Labels,
                }
        }
        return out
}

// Commit writes to the WAL and adds the data to the Head.
//
// It returns ErrAppenderClosed if the appender was already closed; otherwise the
// appender is marked closed when Commit returns. If writing to the WAL fails, the
// appender is rolled back and the wrapped error is returned. Once the WAL write has
// succeeded Commit always returns nil: samples rejected at this stage are only
// reflected in metrics, and a failure to write the out-of-order records to the WBL
// is only logged.
//
// Note that the named result err is shadowed by the local err variables below and
// is never assigned.
// TODO(codesome): Refactor this method to reduce indentation and make it more readable.
func (a *headAppender) Commit() (err error) {
        if a.closed {
                return ErrAppenderClosed
        }
        defer func() { a.closed = true }()

        if err := a.log(); err != nil {
                _ = a.Rollback() // Most likely the same error will happen again.
                return fmt.Errorf("write to WAL: %w", err)
        }

        if a.head.writeNotified != nil {
                a.head.writeNotified.Notify()
        }

        // No errors logging to WAL, so pass the exemplars along to the in memory storage.
        for _, e := range a.exemplars {
                s := a.head.series.getByID(chunks.HeadSeriesRef(e.ref))
                if s == nil {
                        // This is very unlikely to happen, but we have seen it in the wild.
                        // It means that the series was truncated between AppendExemplar and Commit.
                        // See TestHeadCompactionWhileAppendAndCommitExemplar.
                        continue
                }
                // We don't instrument exemplar appends here, all is instrumented by storage.
                if err := a.head.exemplars.AddExemplar(s.lset, e.exemplar); err != nil {
                        if errors.Is(err, storage.ErrOutOfOrderExemplar) {
                                continue
                        }
                        level.Debug(a.head.logger).Log("msg", "Unknown error while adding exemplar", "err", err)
                }
        }

        // When Commit returns: hand the appender's buffers back to the head, close this
        // append ID via head.iso and decrement the active appenders gauge.
        // Deferred calls run in reverse order of registration.
        defer a.head.metrics.activeAppenders.Dec()
        defer a.head.putAppendBuffer(a.samples)
        defer a.head.putSeriesBuffer(a.sampleSeries)
        defer a.head.putExemplarBuffer(a.exemplars)
        defer a.head.putHistogramBuffer(a.histograms)
        defer a.head.putFloatHistogramBuffer(a.floatHistograms)
        defer a.head.putMetadataBuffer(a.metadata)
        defer a.head.iso.closeAppend(a.appendID)

        var (
                // Start from "everything appended" and decrement for each dropped sample.
                floatsAppended     = len(a.samples)
                histogramsAppended = len(a.histograms) + len(a.floatHistograms)
                // number of samples out of order but accepted: with ooo enabled and within time window
                floatOOOAccepted int
                // number of samples rejected due to: out of order but OOO support disabled.
                floatOOORejected int
                histoOOORejected int
                // number of samples rejected due to: that are out of order but too old (OOO support enabled, but outside time window)
                floatTooOldRejected int
                // number of samples rejected due to: out of bounds: with t < minValidTime (OOO support disabled)
                floatOOBRejected int

                // Min/max trackers start at the opposite extremes so that the first
                // accepted sample initialises them.
                inOrderMint     int64 = math.MaxInt64
                inOrderMaxt     int64 = math.MinInt64
                ooomint         int64 = math.MaxInt64
                ooomaxt         int64 = math.MinInt64
                wblSamples      []record.RefSample
                oooMmapMarkers  map[chunks.HeadSeriesRef]chunks.ChunkDiskMapperRef
                oooRecords      [][]byte
                oooCapMax       = a.head.opts.OutOfOrderCapMax.Load()
                series          *memSeries
                appendChunkOpts = chunkOpts{
                        chunkDiskMapper: a.head.chunkDiskMapper,
                        chunkRange:      a.head.chunkRange.Load(),
                        samplesPerChunk: a.head.opts.SamplesPerChunk,
                }
                enc record.Encoder
        )
        // Hand the byte buffers backing the collected WBL records back to the head
        // when Commit returns.
        defer func() {
                for i := range oooRecords {
                        a.head.putBytesBuffer(oooRecords[i][:0])
                }
        }()
        // collectOOORecords encodes the m-map markers and OOO samples accumulated so far
        // into oooRecords (markers first, then samples) and resets both accumulators.
        collectOOORecords := func() {
                if a.head.wbl == nil {
                        // WBL is not enabled. So no need to collect.
                        wblSamples = nil
                        oooMmapMarkers = nil
                        return
                }
                // The m-map happens before adding a new sample. So we collect
                // the m-map markers first, and then samples.
                // WBL Graphically:
                //   WBL Before this Commit(): [old samples before this commit for chunk 1]
                //   WBL After this Commit():  [old samples before this commit for chunk 1][new samples in this commit for chunk 1]mmapmarker1[samples for chunk 2]mmapmarker2[samples for chunk 3]
                if oooMmapMarkers != nil {
                        markers := make([]record.RefMmapMarker, 0, len(oooMmapMarkers))
                        for ref, mmapRef := range oooMmapMarkers {
                                markers = append(markers, record.RefMmapMarker{
                                        Ref:     ref,
                                        MmapRef: mmapRef,
                                })
                        }
                        r := enc.MmapMarkers(markers, a.head.getBytesBuffer())
                        oooRecords = append(oooRecords, r)
                }

                if len(wblSamples) > 0 {
                        r := enc.Samples(wblSamples, a.head.getBytesBuffer())
                        oooRecords = append(oooRecords, r)
                }

                wblSamples = nil
                oooMmapMarkers = nil
        }
        // Float samples: re-check each one under its series lock and route it to the
        // in-order or the out-of-order chunk.
        for i, s := range a.samples {
                series = a.sampleSeries[i]
                series.Lock()

                oooSample, _, err := series.appendable(s.T, s.V, a.headMaxt, a.minValidTime, a.oooTimeWindow)
                // First switch: metrics accounting for rejected samples only.
                switch {
                case err == nil:
                        // Do nothing.
                case errors.Is(err, storage.ErrOutOfOrderSample):
                        floatsAppended--
                        floatOOORejected++
                case errors.Is(err, storage.ErrOutOfBounds):
                        floatsAppended--
                        floatOOBRejected++
                case errors.Is(err, storage.ErrTooOldSample):
                        floatsAppended--
                        floatTooOldRejected++
                default:
                        floatsAppended--
                }

                var ok, chunkCreated bool

                // Second switch: actually store the sample if it was not rejected.
                switch {
                case err != nil:
                        // Do nothing here.
                case oooSample:
                        // Sample is OOO and OOO handling is enabled
                        // and the delta is within the OOO tolerance.
                        var mmapRef chunks.ChunkDiskMapperRef
                        ok, chunkCreated, mmapRef = series.insert(s.T, s.V, a.head.chunkDiskMapper, oooCapMax)
                        if chunkCreated {
                                // NOTE: this ok is scoped to the enclosing if block and shadows the
                                // outer ok (the insert result), which is left untouched.
                                r, ok := oooMmapMarkers[series.ref]
                                if !ok || r != 0 {
                                        // !ok means there are no markers collected for these samples yet. So we first flush the samples
                                        // before setting this m-map marker.

                                        // r != 0 means we have already m-mapped a chunk for this series in the same Commit().
                                        // Hence, before we m-map again, we should add the samples and m-map markers
                                        // seen till now to the WBL records.
                                        collectOOORecords()
                                }

                                if oooMmapMarkers == nil {
                                        oooMmapMarkers = make(map[chunks.HeadSeriesRef]chunks.ChunkDiskMapperRef)
                                }
                                oooMmapMarkers[series.ref] = mmapRef
                        }
                        if ok {
                                wblSamples = append(wblSamples, s)
                                if s.T < ooomint {
                                        ooomint = s.T
                                }
                                if s.T > ooomaxt {
                                        ooomaxt = s.T
                                }
                                floatOOOAccepted++
                        } else {
                                // Sample is an exact duplicate of the last sample.
                                // NOTE: We can only detect updates if they clash with a sample in the OOOHeadChunk,
                                // not with samples in already flushed OOO chunks.
                                // TODO(codesome): Add error reporting? It depends on addressing https://github.com/prometheus/prometheus/discussions/10305.
                                floatsAppended--
                        }
                default:
                        ok, chunkCreated = series.append(s.T, s.V, a.appendID, appendChunkOpts)
                        if ok {
                                if s.T < inOrderMint {
                                        inOrderMint = s.T
                                }
                                if s.T > inOrderMaxt {
                                        inOrderMaxt = s.T
                                }
                        } else {
                                // The sample is an exact duplicate, and should be silently dropped.
                                floatsAppended--
                        }
                }

                if chunkCreated {
                        a.head.metrics.chunks.Inc()
                        a.head.metrics.chunksCreated.Inc()
                }

                series.cleanupAppendIDsBelow(a.cleanupAppendIDsBelow)
                series.pendingCommit = false
                series.Unlock()
        }

        // Integer histograms: in-order only. A sample that is not appended (ok == false)
        // is counted as an out-of-order rejection.
        for i, s := range a.histograms {
                series = a.histogramSeries[i]
                series.Lock()
                ok, chunkCreated := series.appendHistogram(s.T, s.H, a.appendID, appendChunkOpts)
                series.cleanupAppendIDsBelow(a.cleanupAppendIDsBelow)
                series.pendingCommit = false
                series.Unlock()

                if ok {
                        if s.T < inOrderMint {
                                inOrderMint = s.T
                        }
                        if s.T > inOrderMaxt {
                                inOrderMaxt = s.T
                        }
                } else {
                        histogramsAppended--
                        histoOOORejected++
                }
                if chunkCreated {
                        a.head.metrics.chunks.Inc()
                        a.head.metrics.chunksCreated.Inc()
                }
        }

        // Float histograms: same handling as integer histograms above.
        for i, s := range a.floatHistograms {
                series = a.floatHistogramSeries[i]
                series.Lock()
                ok, chunkCreated := series.appendFloatHistogram(s.T, s.FH, a.appendID, appendChunkOpts)
                series.cleanupAppendIDsBelow(a.cleanupAppendIDsBelow)
                series.pendingCommit = false
                series.Unlock()

                if ok {
                        if s.T < inOrderMint {
                                inOrderMint = s.T
                        }
                        if s.T > inOrderMaxt {
                                inOrderMaxt = s.T
                        }
                } else {
                        histogramsAppended--
                        histoOOORejected++
                }
                if chunkCreated {
                        a.head.metrics.chunks.Inc()
                        a.head.metrics.chunksCreated.Inc()
                }
        }

        // Store the queued metadata on each series, under the series lock.
        for i, m := range a.metadata {
                series = a.metadataSeries[i]
                series.Lock()
                series.meta = &metadata.Metadata{Type: record.ToMetricType(m.Type), Unit: m.Unit, Help: m.Help}
                series.Unlock()
        }

        // Publish the counters gathered above and update the head's time ranges.
        a.head.metrics.outOfOrderSamples.WithLabelValues(sampleMetricTypeFloat).Add(float64(floatOOORejected))
        a.head.metrics.outOfOrderSamples.WithLabelValues(sampleMetricTypeHistogram).Add(float64(histoOOORejected))
        a.head.metrics.outOfBoundSamples.WithLabelValues(sampleMetricTypeFloat).Add(float64(floatOOBRejected))
        a.head.metrics.tooOldSamples.WithLabelValues(sampleMetricTypeFloat).Add(float64(floatTooOldRejected))
        a.head.metrics.samplesAppended.WithLabelValues(sampleMetricTypeFloat).Add(float64(floatsAppended))
        a.head.metrics.samplesAppended.WithLabelValues(sampleMetricTypeHistogram).Add(float64(histogramsAppended))
        a.head.metrics.outOfOrderSamplesAppended.WithLabelValues(sampleMetricTypeFloat).Add(float64(floatOOOAccepted))
        a.head.updateMinMaxTime(inOrderMint, inOrderMaxt)
        a.head.updateMinOOOMaxOOOTime(ooomint, ooomaxt)

        // Flush whatever markers/samples are still pending, then write all OOO records in one call.
        collectOOORecords()
        if a.head.wbl != nil {
                if err := a.head.wbl.Log(oooRecords...); err != nil {
                        // TODO(codesome): Currently WBL logging of ooo samples is best effort here since we cannot try logging
                        // until we have found what samples become OOO. We can try having a metric for this failure.
                        // Returning the error here is not correct because we have already put the samples into the memory,
                        // hence the append/insert was a success.
                        level.Error(a.head.logger).Log("msg", "Failed to log out of order samples into the WAL", "err", err)
                }
        }
        return nil
}

// insert is like append, except it inserts. Used for OOO samples.
//
// It lazily allocates the series' OOO state, cuts a new OOO head chunk when
// there is none yet or when the current one is full with respect to
// oooCapMax, and then inserts (t, v) into the OOO head chunk.
//
// Returns:
//   - inserted: the result of the chunk's Insert call (Commit treats false as
//     an exact duplicate).
//   - chunkCreated: whether a new OOO head chunk was cut by this call.
//   - mmapRef: the reference returned by cutNewOOOHeadChunk, or the zero value
//     when no chunk was cut.
func (s *memSeries) insert(t int64, v float64, chunkDiskMapper *chunks.ChunkDiskMapper, oooCapMax int64) (inserted, chunkCreated bool, mmapRef chunks.ChunkDiskMapperRef) {
        if s.ooo == nil {
                s.ooo = &memSeriesOOOFields{}
        }
        c := s.ooo.oooHeadChunk

        needNewChunk := c == nil
        if !needNewChunk {
                numSamples, limit := c.chunk.NumSamples(), int(oooCapMax)
                // The cap is loaded afresh on every Commit, so it may be lowered while a
                // chunk already holds more samples than the new cap. A plain equality
                // check would then never fire again and the chunk would grow without
                // bound, hence "at or above" for positive caps. Non-positive caps keep
                // the previous equality-only behaviour.
                needNewChunk = numSamples == limit || (limit > 0 && numSamples > limit)
        }
        if needNewChunk {
                // Note: If no new samples come in then we rely on compaction to clean up stale in-memory OOO chunks.
                c, mmapRef = s.cutNewOOOHeadChunk(t, chunkDiskMapper)
                chunkCreated = true
        }

        if !c.chunk.Insert(t, v) {
                return false, chunkCreated, mmapRef
        }

        // A freshly cut chunk takes its time range from this first sample;
        // otherwise the existing range is widened as needed.
        if chunkCreated || t < c.minTime {
                c.minTime = t
        }
        if chunkCreated || t > c.maxTime {
                c.maxTime = t
        }
        return true, chunkCreated, mmapRef
}

// chunkOpts are chunk-level options that are passed when appending to a memSeries.
type chunkOpts struct {
        // chunkDiskMapper is filled from the head's chunkDiskMapper in Commit.
        // NOTE(review): presumably used to m-map chunks when a new head chunk is cut — confirm in the append preprocessors.
        chunkDiskMapper *chunks.ChunkDiskMapper
        // chunkRange is passed to cutNewHeadChunk and rangeForTimestamp when a
        // chunk is cut or its next cut time (nextAt) is computed.
        chunkRange      int64
        // samplesPerChunk is filled from the head option SamplesPerChunk in Commit.
        // NOTE(review): its consumer is not visible from here — presumably the target chunk size in samples.
        samplesPerChunk int
}

// append stores the float sample (t, v) in the series' head chunk.
// appendID identifies the append for isolation; passing zero disables
// isolation tracking for this sample.
// It returns whether the sample was in order (and thus stored) and whether a
// new chunk was created while doing so.
// It is unsafe to call this concurrently with s.iterator(...) without holding the series lock.
func (s *memSeries) append(t int64, v float64, appendID uint64, o chunkOpts) (sampleInOrder, chunkCreated bool) {
        head, inOrder, created := s.appendPreprocessor(t, chunkenc.EncXOR, o)
        if !inOrder {
                return false, created
        }

        s.app.Append(t, v)
        head.maxTime = t

        // A float sample is now the latest value; clear any histogram "last value".
        s.lastValue = v
        s.lastHistogramValue = nil
        s.lastFloatHistogramValue = nil

        if appendID > 0 {
                s.txs.add(appendID)
        }
        return true, created
}

// appendHistogram stores the integer histogram h at timestamp t in the series.
// It returns whether the sample was in order (and thus stored) and whether a
// new chunk was created.
// It is unsafe to call this concurrently with s.iterator(...) without holding the series lock.
// When the existing chunk has to be recoded, a new chunk is allocated and the
// old one dropped; chunkCreated is reported as false in that case so that
// prometheus_tsdb_head_chunks and prometheus_tsdb_head_chunks_created_total
// keep their meaning.
func (s *memSeries) appendHistogram(t int64, h *histogram.Histogram, appendID uint64, o chunkOpts) (sampleInOrder, chunkCreated bool) {
        // The head drives recoding itself so that it owns the resulting chunk
        // reference and can mmap used up chunks.

        // Remember the current appender before preprocessing. A failed type
        // assertion leaves this nil, which is fine: an appender of another type
        // must not be compared against anyway.
        previous, _ := s.app.(*chunkenc.HistogramAppender)

        head, inOrder, created := s.histogramsAppendPreprocessor(t, chunkenc.EncHistogram, o)
        if !inOrder {
                return false, created
        }
        if !created {
                // Continuing the current chunk: the previous appender is irrelevant.
                previous = nil
        }

        // The final false asks the appender for a new chunk if one is needed.
        replacement, recoded, app, _ := s.app.AppendHistogram(previous, t, h, false)
        s.app = app

        s.lastHistogramValue = h
        s.lastFloatHistogramValue = nil

        if appendID > 0 {
                s.txs.add(appendID)
        }

        switch {
        case replacement == nil:
                // Appended to the existing chunk, or first sample of a new chunk.
                head.maxTime = t
                return true, created
        case recoded:
                // The chunk was recoded: swap it in place, no new chunk is counted.
                head.maxTime = t
                head.chunk = replacement
                return true, false
        default:
                // The appender started a new chunk: make it the new head chunk.
                s.headChunks = &memChunk{
                        chunk:   replacement,
                        minTime: t,
                        maxTime: t,
                        prev:    s.headChunks,
                }
                s.nextAt = rangeForTimestamp(t, o.chunkRange)
                return true, true
        }
}

// appendFloatHistogram stores the float histogram fh at timestamp t in the series.
// It returns whether the sample was in order (and thus stored) and whether a
// new chunk was created.
// It is unsafe to call this concurrently with s.iterator(...) without holding the series lock.
// When the existing chunk has to be recoded, a new chunk is allocated and the
// old one dropped; chunkCreated is reported as false in that case so that
// prometheus_tsdb_head_chunks and prometheus_tsdb_head_chunks_created_total
// keep their meaning.
func (s *memSeries) appendFloatHistogram(t int64, fh *histogram.FloatHistogram, appendID uint64, o chunkOpts) (sampleInOrder, chunkCreated bool) {
        // The head drives recoding itself so that it owns the resulting chunk
        // reference and can mmap used up chunks.

        // Remember the current appender before preprocessing. A failed type
        // assertion leaves this nil, which is fine: an appender of another type
        // must not be compared against anyway.
        previous, _ := s.app.(*chunkenc.FloatHistogramAppender)

        head, inOrder, created := s.histogramsAppendPreprocessor(t, chunkenc.EncFloatHistogram, o)
        if !inOrder {
                return false, created
        }
        if !created {
                // Continuing the current chunk: the previous appender is irrelevant.
                previous = nil
        }

        // The final false asks the appender for a new chunk if one is needed.
        replacement, recoded, app, _ := s.app.AppendFloatHistogram(previous, t, fh, false)
        s.app = app

        s.lastHistogramValue = nil
        s.lastFloatHistogramValue = fh

        if appendID > 0 {
                s.txs.add(appendID)
        }

        switch {
        case replacement == nil:
                // Appended to the existing chunk, or first sample of a new chunk.
                head.maxTime = t
                return true, created
        case recoded:
                // The chunk was recoded: swap it in place, no new chunk is counted.
                head.maxTime = t
                head.chunk = replacement
                return true, false
        default:
                // The appender started a new chunk: make it the new head chunk.
                s.headChunks = &memChunk{
                        chunk:   replacement,
                        minTime: t,
                        maxTime: t,
                        prev:    s.headChunks,
                }
                s.nextAt = rangeForTimestamp(t, o.chunkRange)
                return true, true
        }
}

// appendPreprocessor takes care of cutting new XOR chunks and m-mapping old ones. XOR chunks are cut based on the
// number of samples they contain with a soft cap in bytes.
// It is unsafe to call this concurrently with s.iterator(...) without holding the series lock.
// This should be called only when appending data.
//
// It returns the head chunk the sample with timestamp t should be appended to, whether the
// sample is in order (t must be strictly greater than the max time of the most recent chunk),
// and whether a new head chunk was cut during this call. When the sample is out of order
// because it is already covered by the m-mapped chunks and there is no head chunk, the
// returned chunk is nil.
func (s *memSeries) appendPreprocessor(t int64, e chunkenc.Encoding, o chunkOpts) (c *memChunk, sampleInOrder, chunkCreated bool) {
        // We target chunkenc.MaxBytesPerXORChunk as a hard limit for the size of an XOR chunk. We must determine whether
        // to cut a new head chunk without knowing the size of the next sample, however, so we assume the next sample
        // will be a maximally-sized sample (19 bytes).
        const maxBytesPerXORChunk = chunkenc.MaxBytesPerXORChunk - 19

        c = s.headChunks

        if c == nil {
                if len(s.mmappedChunks) > 0 && s.mmappedChunks[len(s.mmappedChunks)-1].maxTime >= t {
                        // Out of order sample. Sample timestamp is already in the mmapped chunks, so ignore it.
                        return c, false, false
                }
                // There is no head chunk in this series yet, create the first chunk for the sample.
                c = s.cutNewHeadChunk(t, e, o.chunkRange)
                chunkCreated = true
        }

        // Out of order sample.
        if c.maxTime >= t {
                return c, false, chunkCreated
        }

        // Check the chunk size, unless we just created it and if the chunk is too large, cut a new one.
        if !chunkCreated && len(c.chunk.Bytes()) > maxBytesPerXORChunk {
                c = s.cutNewHeadChunk(t, e, o.chunkRange)
                chunkCreated = true
        }

        if c.chunk.Encoding() != e {
                // The chunk encoding expected by this append is different than the head chunk's
                // encoding. So we cut a new chunk with the expected encoding.
                c = s.cutNewHeadChunk(t, e, o.chunkRange)
                chunkCreated = true
        }

        numSamples := c.chunk.NumSamples()
        if numSamples == 0 {
                // It could be the new chunk created after reading the chunk snapshot,
                // hence we fix the minTime of the chunk here.
                c.minTime = t
                s.nextAt = rangeForTimestamp(c.minTime, o.chunkRange)
        }

        // If we reach 25% of a chunk's desired sample count, predict an end time
        // for this chunk that will try to make samples equally distributed within
        // the remaining chunks in the current chunk range.
        // At latest it must happen at the timestamp set when the chunk was cut.
        if numSamples == o.samplesPerChunk/4 {
                s.nextAt = computeChunkEndTime(c.minTime, c.maxTime, s.nextAt, 4)
        }
        // If numSamples >= samplesPerChunk*2 then our previous prediction was invalid,
        // most likely because samples rate has changed and now they are arriving more frequently.
        // Since we assume that the rate is higher, we're being conservative and cutting at 2*samplesPerChunk
        // as we expect more chunks to come.
        // Note that next chunk will have its nextAt recalculated for the new rate.
        if t >= s.nextAt || numSamples >= o.samplesPerChunk*2 {
                c = s.cutNewHeadChunk(t, e, o.chunkRange)
                chunkCreated = true
        }

        return c, true, chunkCreated
}

// histogramsAppendPreprocessor takes care of cutting new histogram chunks and m-mapping old ones. Histogram chunks are
// cut based on their size in bytes.
// It is unsafe to call this concurrently with s.iterator(...) without holding the series lock.
// This should be called only when appending data.
//
// It returns the head chunk the sample with timestamp t should be appended to, whether the
// sample is in order (t must be strictly greater than the max time of the most recent chunk),
// and whether a new head chunk was cut during this call. When the sample is out of order
// because it is already covered by the m-mapped chunks and there is no head chunk, the
// returned chunk is nil.
func (s *memSeries) histogramsAppendPreprocessor(t int64, e chunkenc.Encoding, o chunkOpts) (c *memChunk, sampleInOrder, chunkCreated bool) {
        c = s.headChunks

        if c == nil {
                if len(s.mmappedChunks) > 0 && s.mmappedChunks[len(s.mmappedChunks)-1].maxTime >= t {
                        // Out of order sample. Sample timestamp is already in the mmapped chunks, so ignore it.
                        return c, false, false
                }
                // There is no head chunk in this series yet, create the first chunk for the sample.
                c = s.cutNewHeadChunk(t, e, o.chunkRange)
                chunkCreated = true
        }

        // Out of order sample.
        if c.maxTime >= t {
                return c, false, chunkCreated
        }

        if c.chunk.Encoding() != e {
                // The chunk encoding expected by this append is different than the head chunk's
                // encoding. So we cut a new chunk with the expected encoding.
                c = s.cutNewHeadChunk(t, e, o.chunkRange)
                chunkCreated = true
        }

        numSamples := c.chunk.NumSamples()
        targetBytes := chunkenc.TargetBytesPerHistogramChunk
        numBytes := len(c.chunk.Bytes())

        if numSamples == 0 {
                // It could be the new chunk created after reading the chunk snapshot,
                // hence we fix the minTime of the chunk here.
                c.minTime = t
                s.nextAt = rangeForTimestamp(c.minTime, o.chunkRange)
        }

        // Below, we will enforce chunkenc.MinSamplesPerHistogramChunk. There are, however, two cases that supersede it:
        //  - The current chunk range is ending before chunkenc.MinSamplesPerHistogramChunk will be satisfied.
        //  - s.nextAt was set while loading a chunk snapshot with the intent that a new chunk be cut on the next append.
        var nextChunkRangeStart int64
        if s.histogramChunkHasComputedEndTime {
                nextChunkRangeStart = rangeForTimestamp(c.minTime, o.chunkRange)
        } else {
                // If we haven't computed an end time yet, s.nextAt is either set to
                // rangeForTimestamp(c.minTime, o.chunkRange) or was set while loading a chunk snapshot. Either way, we want to
                // skip enforcing chunkenc.MinSamplesPerHistogramChunk.
                nextChunkRangeStart = s.nextAt
        }

        // If we reach 25% of a chunk's desired maximum size, predict an end time
        // for this chunk that will try to make samples equally distributed within
        // the remaining chunks in the current chunk range.
        // At the latest it must happen at the timestamp set when the chunk was cut.
        // The prediction is made at most once per chunk, guarded by s.histogramChunkHasComputedEndTime.
        if !s.histogramChunkHasComputedEndTime && numBytes >= targetBytes/4 {
                ratioToFull := float64(targetBytes) / float64(numBytes)
                s.nextAt = computeChunkEndTime(c.minTime, c.maxTime, s.nextAt, ratioToFull)
                s.histogramChunkHasComputedEndTime = true
        }
        // If numBytes >= targetBytes*2 then our previous prediction was invalid. This could happen if the sample rate has
        // increased or if the bucket/span count has increased.
        // Note that next chunk will have its nextAt recalculated for the new rate.
        // The second half of the condition only allows the cut once the chunk holds the minimum number of
        // samples, or once t has reached nextChunkRangeStart (see above).
        if (t >= s.nextAt || numBytes >= targetBytes*2) && (numSamples >= chunkenc.MinSamplesPerHistogramChunk || t >= nextChunkRangeStart) {
                c = s.cutNewHeadChunk(t, e, o.chunkRange)
                chunkCreated = true
        }

        // The new chunk will also need a new computed end time.
        if chunkCreated {
                s.histogramChunkHasComputedEndTime = false
        }

        return c, true, chunkCreated
}

// computeChunkEndTime estimates the end timestamp of a chunk based on the
// beginning of the chunk (start), the timestamp of its latest sample (cur) and
// the upper bound up to which data is inserted (max).
// The chunk is assumed to be 1/ratioToFull full at cur. Assuming samples keep
// arriving at the same rate, the returned end time splits the span between
// start and max into equally sized chunks. If at most one full chunk fits
// into that span, max itself is returned.
func computeChunkEndTime(start, cur, max int64, ratioToFull float64) int64 {
        // Width of the whole window, and the projected width of one full chunk.
        window := float64(max - start)
        fullChunk := float64(cur-start+1) * ratioToFull

        // Number of full chunks that fit into the window at the current rate.
        chunks := window / fullChunk
        if chunks <= 1 {
                return max
        }
        // Only whole chunks count, so the window is divided by the rounded down number.
        return int64(float64(start) + window/math.Floor(chunks))
}

// cutNewHeadChunk starts a new, empty head chunk beginning at mint, makes it
// the current head chunk of the series and installs its appender as s.app.
// If e is not a valid encoding, an XOR chunk is created instead.
// It returns the newly created head chunk.
func (s *memSeries) cutNewHeadChunk(mint int64, e chunkenc.Encoding, chunkRange int64) *memChunk {
        // The new memChunk points at the previous head via .prev, so the head
        // chunks form a linked list. Every element but the first one gets
        // m-mapped as soon as possible, which means the list holds a single
        // element most of the time.
        head := &memChunk{
                minTime: mint,
                maxTime: math.MinInt64,
                prev:    s.headChunks,
        }
        s.headChunks = head

        var err error
        if chunkenc.IsValidEncoding(e) {
                head.chunk, err = chunkenc.NewEmptyChunk(e)
        } else {
                head.chunk = chunkenc.NewXORChunk()
        }
        if err != nil {
                panic(err) // This should never happen.
        }

        // Upper bound on when the next chunk has to be started. An earlier
        // timestamp may be picked dynamically later on.
        s.nextAt = rangeForTimestamp(mint, chunkRange)

        app, appErr := head.chunk.Appender()
        if appErr != nil {
                panic(appErr)
        }
        s.app = app
        return head
}

// cutNewOOOHeadChunk m-maps the current OOO head chunk (if there is one) and
// replaces it with a fresh, empty OOO chunk starting at mint.
// It returns the new OOO head chunk along with the reference returned by
// m-mapping the old one.
// The caller must ensure that s.ooo is not nil.
func (s *memSeries) cutNewOOOHeadChunk(mint int64, chunkDiskMapper *chunks.ChunkDiskMapper) (*oooHeadChunk, chunks.ChunkDiskMapperRef) {
        mmappedRef := s.mmapCurrentOOOHeadChunk(chunkDiskMapper)

        fresh := &oooHeadChunk{
                chunk:   NewOOOChunk(),
                minTime: mint,
                maxTime: math.MinInt64,
        }
        s.ooo.oooHeadChunk = fresh

        return fresh, mmappedRef
}

// mmapCurrentOOOHeadChunk writes the current OOO head chunk through the given
// chunk disk mapper, records it in s.ooo.oooMmappedChunks and clears the OOO
// head chunk. It returns the reference of the written chunk, or 0 if there is
// no OOO head chunk to write.
func (s *memSeries) mmapCurrentOOOHeadChunk(chunkDiskMapper *chunks.ChunkDiskMapper) chunks.ChunkDiskMapperRef {
        if s.ooo == nil || s.ooo.oooHeadChunk == nil {
                // No head chunk, hence nothing to m-map.
                return 0
        }
        head := s.ooo.oooHeadChunk

        // Encode to an XOR chunk, which is more compact and implements all of the needed functionality.
        xor, _ := head.chunk.ToXOR()
        ref := chunkDiskMapper.WriteChunk(s.ref, head.minTime, head.maxTime, xor, true, handleChunkWriteError)

        mmapped := &mmappedChunk{
                ref:        ref,
                numSamples: uint16(xor.NumSamples()),
                minTime:    head.minTime,
                maxTime:    head.maxTime,
        }
        s.ooo.oooMmappedChunks = append(s.ooo.oooMmappedChunks, mmapped)
        s.ooo.oooHeadChunk = nil
        return ref
}

// mmapChunks m-maps every chunk on the s.headChunks list except the first
// (most recent) one and unlinks the written chunks from the list.
// It returns the number of chunks that were m-mapped.
func (s *memSeries) mmapChunks(chunkDiskMapper *chunks.ChunkDiskMapper) (count int) {
        head := s.headChunks
        if head == nil || head.prev == nil {
                // Zero or one head chunk: nothing to m-map.
                return 0
        }

        // Walk from the oldest chunk towards the newest and stop right before the
        // current head. Given the chain s.headChunks{t4} -> t3 -> t2 -> t1 -> t0,
        // chunks t0 through t3 are written while s.headChunks itself is skipped.
        for offset := head.len() - 1; offset >= 1; offset-- {
                old := head.atOffset(offset)
                ref := chunkDiskMapper.WriteChunk(s.ref, old.minTime, old.maxTime, old.chunk, false, handleChunkWriteError)
                s.mmappedChunks = append(s.mmappedChunks, &mmappedChunk{
                        ref:        ref,
                        numSamples: uint16(old.chunk.NumSamples()),
                        minTime:    old.minTime,
                        maxTime:    old.maxTime,
                })
                count++
        }

        // Everything behind the current head has been written out, so drop the
        // links to those chunks.
        head.prev = nil

        return count
}

// handleChunkWriteError is the error callback passed to the chunk disk mapper
// when writing chunks. A nil error and chunks.ErrChunkDiskMapperClosed are
// ignored; any other error causes a panic.
func handleChunkWriteError(err error) {
        if err == nil || errors.Is(err, chunks.ErrChunkDiskMapperClosed) {
                return
        }
        panic(err)
}

// Rollback removes the samples, exemplars, histograms, float histograms and
// metadata from headAppender and writes any series to WAL.
// It returns ErrAppenderClosed if the appender has already been closed,
// otherwise the result of a.log(). The appender is marked as closed afterwards.
func (a *headAppender) Rollback() (err error) {
        if a.closed {
                return ErrAppenderClosed
        }
        defer func() { a.closed = true }()
        defer a.head.metrics.activeAppenders.Dec()
        defer a.head.iso.closeAppend(a.appendID)
        defer a.head.putSeriesBuffer(a.sampleSeries)

        // Clear the pending state on every series touched by this appender.
        var series *memSeries
        for i := range a.samples {
                series = a.sampleSeries[i]
                series.Lock()
                series.cleanupAppendIDsBelow(a.cleanupAppendIDsBelow)
                series.pendingCommit = false
                series.Unlock()
        }
        for i := range a.histograms {
                series = a.histogramSeries[i]
                series.Lock()
                series.cleanupAppendIDsBelow(a.cleanupAppendIDsBelow)
                series.pendingCommit = false
                series.Unlock()
        }

        // Hand the buffers back to their pools and drop every reference to them.
        // All of them must be reset, including the float histograms: a buffer that
        // went back to the pool may be reused by another appender, and anything
        // still referenced here remains visible to a.log() below even though it
        // has been rolled back.
        a.head.putAppendBuffer(a.samples)
        a.head.putExemplarBuffer(a.exemplars)
        a.head.putHistogramBuffer(a.histograms)
        a.head.putFloatHistogramBuffer(a.floatHistograms)
        a.head.putMetadataBuffer(a.metadata)
        a.samples = nil
        a.exemplars = nil
        a.histograms = nil
        a.floatHistograms = nil
        a.metadata = nil

        // Series are created in the head memory regardless of rollback. Thus we have
        // to log them to the WAL in any case.
        return a.log()
}

// Copyright 2021 The Prometheus Authors
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package tsdb

import (
        "context"
        "errors"
        "fmt"
        "math"
        "slices"
        "sync"

        "github.com/go-kit/log/level"

        "github.com/prometheus/prometheus/model/labels"
        "github.com/prometheus/prometheus/storage"
        "github.com/prometheus/prometheus/tsdb/chunkenc"
        "github.com/prometheus/prometheus/tsdb/chunks"
        "github.com/prometheus/prometheus/tsdb/index"
)

// ExemplarQuerier returns an exemplar querier by delegating to the head's
// exemplar storage.
func (h *Head) ExemplarQuerier(ctx context.Context) (storage.ExemplarQuerier, error) {
        querier, err := h.exemplars.ExemplarQuerier(ctx)
        return querier, err
}

// Index returns an IndexReader against the head that is not restricted to a
// caller supplied time range. The returned error is always nil.
func (h *Head) Index() (IndexReader, error) {
        reader := h.indexRange(math.MinInt64, math.MaxInt64)
        return reader, nil
}

// indexRange returns an index reader for the head covering mint to maxt.
// The lower bound is raised to the head's minimum time if mint lies before it.
func (h *Head) indexRange(mint, maxt int64) *headIndexReader {
        lower := h.MinTime()
        if mint > lower {
                lower = mint
        }
        return &headIndexReader{
                head: h,
                mint: lower,
                maxt: maxt,
        }
}

// headIndexReader reads index data of a Head, limited to the time range
// mint to maxt.
type headIndexReader struct {
        // head is the Head the index data is read from.
        head *Head
        // mint is the lower bound of the time range served by this reader.
        mint int64
        // maxt is the upper bound of the time range served by this reader.
        maxt int64
}

// Close is a no-op and always returns nil.
func (h *headIndexReader) Close() error { return nil }

// Symbols returns an iterator over the symbols held by the head's postings.
func (h *headIndexReader) Symbols() index.StringIter {
        symbols := h.head.postings.Symbols()
        return symbols
}

// SortedLabelValues returns the same label values as LabelValues for the given
// label name and matchers, sorted in ascending order.
// If LabelValues fails, its result and error are passed through unsorted.
func (h *headIndexReader) SortedLabelValues(ctx context.Context, name string, matchers ...*labels.Matcher) ([]string, error) {
        values, err := h.LabelValues(ctx, name, matchers...)
        if err != nil {
                return values, err
        }
        slices.Sort(values)
        return values, nil
}

// LabelValues returns the label values present in the head for the given
// label name. An empty, non-nil slice is returned when the reader's time range
// (mint to maxt) does not overlap with the head's time range.
// When matchers are given, the result is reduced to the label values of
// metrics matching them.
func (h *headIndexReader) LabelValues(ctx context.Context, name string, matchers ...*labels.Matcher) ([]string, error) {
        switch {
        case h.maxt < h.head.MinTime() || h.mint > h.head.MaxTime():
                // The requested range lies entirely outside of the head.
                return []string{}, nil
        case len(matchers) == 0:
                return h.head.postings.LabelValues(ctx, name), nil
        default:
                return labelValuesWithMatchers(ctx, h, name, matchers...)
        }
}

// LabelNames returns the unique label names present in the head. An empty,
// non-nil slice is returned when the reader's time range (mint to maxt) does
// not overlap with the head's time range.
// Without matchers the names of all postings are returned in sorted order;
// with matchers the lookup is delegated to labelNamesWithMatchers.
func (h *headIndexReader) LabelNames(ctx context.Context, matchers ...*labels.Matcher) ([]string, error) {
        if h.maxt < h.head.MinTime() || h.mint > h.head.MaxTime() {
                // The requested range lies entirely outside of the head.
                return []string{}, nil
        }

        if len(matchers) > 0 {
                return labelNamesWithMatchers(ctx, h, matchers...)
        }

        names := h.head.postings.LabelNames()
        slices.Sort(names)
        return names, nil
}

// Postings returns the postings list iterator for the label name combined
// with each of the given values. No values yield empty postings, a single
// value yields its postings directly, and several values yield the merge of
// their non-empty postings.
func (h *headIndexReader) Postings(ctx context.Context, name string, values ...string) (index.Postings, error) {
        if len(values) == 0 {
                return index.EmptyPostings(), nil
        }
        if len(values) == 1 {
                return h.head.postings.Get(name, values[0]), nil
        }

        toMerge := make([]index.Postings, 0, len(values))
        for i := range values {
                p := h.head.postings.Get(name, values[i])
                if index.IsEmptyPostingsType(p) {
                        // Empty postings contribute nothing to the merge.
                        continue
                }
                toMerge = append(toMerge, p)
        }
        return index.Merge(ctx, toMerge...), nil
}

// PostingsForLabelMatching delegates to the head's postings and returns the
// postings for the label name restricted by the match function.
func (h *headIndexReader) PostingsForLabelMatching(ctx context.Context, name string, match func(string) bool) index.Postings {
        matched := h.head.postings.PostingsForLabelMatching(ctx, name, match)
        return matched
}

// SortedPostings returns the given postings ordered by the label sets of the
// series they refer to. Series that can no longer be found in the head are
// dropped. If iterating p fails, error postings wrapping that error are
// returned.
func (h *headIndexReader) SortedPostings(p index.Postings) index.Postings {
        found := make([]*memSeries, 0, 128)

        // Resolve every series exactly once.
        for p.Next() {
                s := h.head.series.getByID(chunks.HeadSeriesRef(p.At()))
                if s == nil {
                        level.Debug(h.head.logger).Log("msg", "Looked up series not found")
                        continue
                }
                found = append(found, s)
        }
        if err := p.Err(); err != nil {
                return index.ErrPostings(fmt.Errorf("expand postings: %w", err))
        }

        slices.SortFunc(found, func(x, y *memSeries) int {
                return labels.Compare(x.lset, y.lset)
        })

        // Turn the sorted series back into a list of references.
        refs := make([]storage.SeriesRef, len(found))
        for i, s := range found {
                refs[i] = storage.SeriesRef(s.ref)
        }
        return index.NewListPostings(refs)
}

// ShardedPostings implements IndexReader. It returns the postings from p whose
// series belong to the shard shardIndex out of shardCount shards, i.e. whose
// shard hash modulo shardCount equals shardIndex. Series that can no longer be
// found in the head are skipped.
// This function returns a failing postings list if sharding has not been
// enabled in the Head, or if iterating over p fails.
func (h *headIndexReader) ShardedPostings(p index.Postings, shardIndex, shardCount uint64) index.Postings {
        if !h.head.opts.EnableSharding {
                return index.ErrPostings(errors.New("sharding is disabled"))
        }

        out := make([]storage.SeriesRef, 0, 128)

        for p.Next() {
                s := h.head.series.getByID(chunks.HeadSeriesRef(p.At()))
                if s == nil {
                        level.Debug(h.head.logger).Log("msg", "Looked up series not found")
                        continue
                }

                // Check if the series belong to the shard.
                if s.shardHash%shardCount != shardIndex {
                        continue
                }

                out = append(out, storage.SeriesRef(s.ref))
        }
        // Next() returning false may mean that the iteration failed. Surface that
        // error instead of returning a silently truncated list.
        if err := p.Err(); err != nil {
                return index.ErrPostings(fmt.Errorf("expand postings: %w", err))
        }

        return index.NewListPostings(out)
}

// Series looks up the series for the given reference, assigns its labels to
// builder and, unless chks is nil, replaces the content of *chks with the
// metas of all chunks overlapping the reader's time range (m-mapped chunks
// first, then head chunks from oldest to newest).
// It returns storage.ErrNotFound if the series does not exist.
func (h *headIndexReader) Series(ref storage.SeriesRef, builder *labels.ScratchBuilder, chks *[]chunks.Meta) error {
        s := h.head.series.getByID(chunks.HeadSeriesRef(ref))
        if s == nil {
                h.head.metrics.seriesNotFound.Inc()
                return storage.ErrNotFound
        }
        builder.Assign(s.lset)

        if chks == nil {
                // The caller is not interested in chunks.
                return nil
        }

        s.Lock()
        defer s.Unlock()

        *chks = (*chks)[:0]

        // Only expose chunks that overlap with the range of this reader.
        for pos, mc := range s.mmappedChunks {
                if mc.OverlapsClosedInterval(h.mint, h.maxt) {
                        *chks = append(*chks, chunks.Meta{
                                MinTime: mc.minTime,
                                MaxTime: mc.maxTime,
                                Ref:     chunks.ChunkRef(chunks.NewHeadChunkRef(s.ref, s.headChunkID(pos))),
                        })
                }
        }

        if s.headChunks == nil {
                return nil
        }

        // Offset 0 is the most recent head chunk, so iterate from the highest
        // offset (oldest chunk) downwards to emit the metas in time order.
        headLen := s.headChunks.len()
        for offset := headLen - 1; offset >= 0; offset-- {
                chk := s.headChunks.atOffset(offset)
                if !chk.OverlapsClosedInterval(h.mint, h.maxt) {
                        continue
                }

                maxTime := chk.maxTime
                if offset == 0 {
                        // The first head chunk is still being appended to, mark it as open.
                        maxTime = math.MaxInt64
                }

                // Head chunks are numbered after the m-mapped ones, oldest first.
                pos := len(s.mmappedChunks) + (headLen - 1 - offset)
                *chks = append(*chks, chunks.Meta{
                        MinTime: chk.minTime,
                        MaxTime: maxTime,
                        Ref:     chunks.ChunkRef(chunks.NewHeadChunkRef(s.ref, s.headChunkID(pos))),
                })
        }

        return nil
}

// headChunkID translates a chunk position into the corresponding HeadChunkID
// by offsetting it with s.firstChunkID.
// * 0 <= pos < len(s.mmappedChunks) refer to s.mmappedChunks[pos]
// * pos >= len(s.mmappedChunks) refers to s.headChunks linked list.
func (s *memSeries) headChunkID(pos int) chunks.HeadChunkID {
        offset := chunks.HeadChunkID(pos)
        return s.firstChunkID + offset
}

// oooHeadChunkID translates an OOO chunk position into the corresponding
// HeadChunkID by offsetting it with s.ooo.firstOOOChunkID.
// * 0 <= pos < len(s.oooMmappedChunks) refer to s.oooMmappedChunks[pos]
// * pos == len(s.oooMmappedChunks) refers to s.oooHeadChunk
// The caller must ensure that s.ooo is not nil.
func (s *memSeries) oooHeadChunkID(pos int) chunks.HeadChunkID {
        offset := chunks.HeadChunkID(pos)
        return s.ooo.firstOOOChunkID + offset
}

// LabelValueFor returns the value of the given label name for the series
// referred to by id. It returns storage.ErrNotFound if the series does not
// exist or if the value of that label is empty.
func (h *headIndexReader) LabelValueFor(_ context.Context, id storage.SeriesRef, label string) (string, error) {
        s := h.head.series.getByID(chunks.HeadSeriesRef(id))
        if s == nil {
                return "", storage.ErrNotFound
        }

        if v := s.lset.Get(label); v != "" {
                return v, nil
        }
        return "", storage.ErrNotFound
}

// LabelNamesFor returns the sorted, de-duplicated label names of all series
// referred to by the given postings. The context is checked for cancellation
// every checkContextEveryNIterations series.
func (h *headIndexReader) LabelNamesFor(ctx context.Context, series index.Postings) ([]string, error) {
        seen := make(map[string]struct{})
        for n := 1; series.Next(); n++ {
                if n%checkContextEveryNIterations == 0 {
                        if err := ctx.Err(); err != nil {
                                return nil, err
                        }
                }
                s := h.head.series.getByID(chunks.HeadSeriesRef(series.At()))
                if s == nil {
                        // The series is gone. This happens during compaction, when it was
                        // garbage collected after the caller obtained the series IDs.
                        continue
                }
                s.lset.Range(func(l labels.Label) {
                        seen[l.Name] = struct{}{}
                })
        }
        if err := series.Err(); err != nil {
                return nil, err
        }

        result := make([]string, 0, len(seen))
        for n := range seen {
                result = append(result, n)
        }
        slices.Sort(result)
        return result, nil
}

// Chunks returns a ChunkReader against the head that is not restricted to a
// caller supplied time range, using an isolation state obtained for that same
// unrestricted range.
func (h *Head) Chunks() (ChunkReader, error) {
        isoState := h.iso.State(math.MinInt64, math.MaxInt64)
        return h.chunksRange(math.MinInt64, math.MaxInt64, isoState)
}

// chunksRange returns a chunk reader for the head covering mint to maxt and
// bound to the given isolation state. The lower bound is raised to the head's
// minimum time if mint lies before it.
// It returns an error if the head has been closed.
func (h *Head) chunksRange(mint, maxt int64, is *isolationState) (*headChunkReader, error) {
        h.closedMtx.Lock()
        defer h.closedMtx.Unlock()

        if h.closed {
                return nil, errors.New("can't read from a closed head")
        }

        lower := h.MinTime()
        if mint > lower {
                lower = mint
        }
        reader := &headChunkReader{
                head:     h,
                mint:     lower,
                maxt:     maxt,
                isoState: is,
        }
        return reader, nil
}

// headChunkReader reads chunks of a Head, limited to chunks overlapping the
// time range mint to maxt.
type headChunkReader struct {
        // head is the Head the chunks are read from.
        head *Head
        // mint is the lower bound of the time range served by this reader.
        mint int64
        // maxt is the upper bound of the time range served by this reader.
        maxt int64
        // isoState is the isolation state attached to returned chunks. It may be
        // nil and is closed together with the reader.
        isoState *isolationState
}

// Close closes the isolation state of the reader, if there is one.
// It always returns nil.
func (h *headChunkReader) Close() error {
        if h.isoState == nil {
                return nil
        }
        h.isoState.Close()
        return nil
}

// ChunkOrIterable returns the chunk for the reference number in meta.
// The head chunk is not copied, and the returned iterable is always nil.
func (h *headChunkReader) ChunkOrIterable(meta chunks.Meta) (chunkenc.Chunk, chunkenc.Iterable, error) {
        c, _, err := h.chunk(meta, false)
        return c, nil, err
}

// ChunkWithCopy returns the chunk for the reference number in meta together
// with the max time of that chunk.
// If the chunk is the in-memory chunk that is still being appended to, a copy
// of it is made and the copy is returned.
func (h *headChunkReader) ChunkWithCopy(meta chunks.Meta) (chunkenc.Chunk, int64, error) {
        c, maxTime, err := h.chunk(meta, true)
        return c, maxTime, err
}

// chunk returns the chunk for the reference number in meta, wrapped in a
// safeHeadChunk, along with the max time of the chunk.
// If copyLastChunk is true and the requested chunk is the open head chunk
// (the one being appended to), a copy of its bytes is returned instead.
// It returns storage.ErrNotFound if the series no longer exists or if the
// chunk does not overlap the time range of the reader.
// The series lock is held while the chunk is looked up and copied, and it is
// released on every return path.
func (h *headChunkReader) chunk(meta chunks.Meta, copyLastChunk bool) (chunkenc.Chunk, int64, error) {
        sid, cid := chunks.HeadChunkRef(meta.Ref).Unpack()

        s := h.head.series.getByID(sid)
        // This means that the series has been garbage collected.
        if s == nil {
                return nil, 0, storage.ErrNotFound
        }

        s.Lock()
        c, headChunk, isOpen, err := s.chunk(cid, h.head.chunkDiskMapper, &h.head.memChunkPool)
        if err != nil {
                s.Unlock()
                return nil, 0, err
        }
        defer func() {
                if !headChunk {
                        // Set this to nil so that Go GC can collect it after it has been used.
                        c.chunk = nil
                        c.prev = nil
                        h.head.memChunkPool.Put(c)
                }
        }()

        // This means that the chunk is outside the specified range.
        if !c.OverlapsClosedInterval(h.mint, h.maxt) {
                s.Unlock()
                return nil, 0, storage.ErrNotFound
        }

        chk, maxTime := c.chunk, c.maxTime
        if headChunk && isOpen && copyLastChunk {
                // The caller may ask to copy the head chunk in order to take the
                // bytes of the chunk without causing the race between read and append.
                b := s.headChunks.chunk.Bytes()
                newB := make([]byte, len(b))
                copy(newB, b) // TODO(codesome): Use bytes.Clone() when we upgrade to Go 1.20.
                // TODO(codesome): Put back in the pool (non-trivial).
                chk, err = h.head.opts.ChunkPool.Get(s.headChunks.chunk.Encoding(), newB)
                if err != nil {
                        // Release the series lock before bailing out; returning with the
                        // lock held would block every later append and read on this series.
                        s.Unlock()
                        return nil, 0, err
                }
        }
        s.Unlock()

        return &safeHeadChunk{
                Chunk:    chk,
                s:        s,
                cid:      cid,
                isoState: h.isoState,
        }, maxTime, nil
}

// chunk returns the chunk for the HeadChunkID from memory or by m-mapping it from the disk.
// If headChunk is false, it means that the returned *memChunk
// (and not the chunkenc.Chunk inside it) can be garbage collected after its usage.
// If isOpen is true, it means that the returned *memChunk is used for appends.
// It returns storage.ErrNotFound if the id does not refer to any chunk of the series.
// A corruption error reported by the chunk disk mapper causes a panic.
func (s *memSeries) chunk(id chunks.HeadChunkID, chunkDiskMapper *chunks.ChunkDiskMapper, memChunkPool *sync.Pool) (chunk *memChunk, headChunk, isOpen bool, err error) {
        // Chunk ids grow by 1 with every new chunk, so (id - firstChunkID) is the
        // position of the chunk within the series. Positions below
        // len(s.mmappedChunks) index into the s.mmappedChunks slice, anything at
        // or above it refers to one of the chunks on the s.headChunks linked list.
        // The order of elements differs between the two:
        // newer chunks are appended to the s.mmappedChunks slice,
        // while newer chunks are prepended to the s.headChunks list.
        //
        // memSeries {
        //   mmappedChunks: [t0, t1, t2]
        //   headChunk:     {t5}->{t4}->{t3}
        // }
        pos := int(id) - int(s.firstChunkID)
        numMmapped := len(s.mmappedChunks)

        numHead := 0
        if s.headChunks != nil {
                numHead = s.headChunks.len()
        }

        if pos < 0 || pos >= numMmapped+numHead {
                return nil, false, false, storage.ErrNotFound
        }

        if pos < numMmapped {
                mmapped := s.mmappedChunks[pos]
                chk, err := chunkDiskMapper.Chunk(mmapped.ref)
                if err != nil {
                        var cerr *chunks.CorruptionErr
                        if errors.As(err, &cerr) {
                                panic(err)
                        }
                        return nil, false, false, err
                }
                // Wrap the m-mapped chunk in a pooled memChunk.
                mc := memChunkPool.Get().(*memChunk)
                mc.chunk = chk
                mc.minTime = mmapped.minTime
                mc.maxTime = mmapped.maxTime
                return mc, false, false, nil
        }

        // headChunks is a linked list whose first element is the most recent chunk
        // and whose last element is the oldest one. That is the reverse of
        // mmappedChunks, where mmappedChunks[0] is the oldest chunk, while
        // headChunks.atOffset(0) yields the most recent chunk. Hence the position
        // within the head chunks has to be reversed to get the list offset.
        offset := numHead - 1 - (pos - numMmapped)
        elem := s.headChunks.atOffset(offset)
        if elem == nil {
                // This should never really happen and would mean that the computed
                // length is NOT equal to the length of the headChunks list.
                return nil, false, false, storage.ErrNotFound
        }
        return elem, true, offset == 0, nil
}

// oooMergedChunks returns an iterable over one or more OOO chunks for the given
// chunks.Meta reference from memory or by m-mapping it from the disk. The
// returned iterable will be a merge of all the overlapping chunks, if any,
// amongst all the chunks in the OOOHead.
//
// meta.Ref selects the requested chunk. meta.OOOLastRef, meta.OOOLastMinTime and
// meta.OOOLastMaxTime are used to leave out chunks and samples that lie beyond
// those markers. mint and maxt are passed to OverlapsClosedInterval to decide
// which other chunks are candidates for the merge.
//
// It returns storage.ErrNotFound if the chunk ID encoded in meta.Ref is outside
// of the range of known OOO chunks, and an error if the OOO head chunk is
// requested but missing, if the OOO head chunk cannot be converted to an XOR
// chunk, or if an m-mapped chunk cannot be read through cdm.
//
// This function is not thread safe unless the caller holds a lock.
// The caller must ensure that s.ooo is not nil.
func (s *memSeries) oooMergedChunks(meta chunks.Meta, cdm *chunks.ChunkDiskMapper, mint, maxt int64) (*mergedOOOChunks, error) {
        _, cid := chunks.HeadChunkRef(meta.Ref).Unpack()

        // ix is the index of the requested chunk in the s.ooo.oooMmappedChunks slice,
        // computed as (chunk ID - s.ooo.firstOOOChunkID).
        // The max index for the s.ooo.oooMmappedChunks slice is len(s.ooo.oooMmappedChunks)-1,
        // hence if ix is len(s.ooo.oooMmappedChunks), it represents the next chunk, which is
        // the OOO head chunk.
        ix := int(cid) - int(s.ooo.firstOOOChunkID)
        if ix < 0 || ix > len(s.ooo.oooMmappedChunks) {
                return nil, storage.ErrNotFound
        }

        // The OOO head chunk was requested, so it has to exist.
        if ix == len(s.ooo.oooMmappedChunks) {
                if s.ooo.oooHeadChunk == nil {
                        return nil, errors.New("invalid ooo head chunk")
                }
        }

        // We create a temporary slice of chunk metas to hold the information of all
        // possible chunks that may overlap with the requested chunk.
        tmpChks := make([]chunkMetaAndChunkDiskMapperRef, 0, len(s.ooo.oooMmappedChunks))

        // Reference the OOO head chunk would have: it uses the ID that follows the
        // last m-mapped OOO chunk.
        oooHeadRef := chunks.ChunkRef(chunks.NewHeadChunkRef(s.ref, s.oooHeadChunkID(len(s.ooo.oooMmappedChunks))))
        if s.ooo.oooHeadChunk != nil && s.ooo.oooHeadChunk.OverlapsClosedInterval(mint, maxt) {
                // We only want to append the head chunk if this chunk existed when
                // Series() was called. This brings consistency in case new data
                // is added in between Series() and Chunk() calls.
                if oooHeadRef == meta.OOOLastRef {
                        tmpChks = append(tmpChks, chunkMetaAndChunkDiskMapperRef{
                                meta: chunks.Meta{
                                        // Ignoring samples added before and after the last known min and max time for this chunk.
                                        MinTime: meta.OOOLastMinTime,
                                        MaxTime: meta.OOOLastMaxTime,
                                        Ref:     oooHeadRef,
                                },
                        })
                }
        }

        for i, c := range s.ooo.oooMmappedChunks {
                chunkRef := chunks.ChunkRef(chunks.NewHeadChunkRef(s.ref, s.oooHeadChunkID(i)))
                // We can skip chunks that came in later than the last known OOOLastRef.
                if chunkRef > meta.OOOLastRef {
                        break
                }

                switch {
                case chunkRef == meta.OOOLastRef:
                        // The chunk matching OOOLastRef is always collected, using the
                        // OOOLast markers as its time range. The chunk's own min and max
                        // times are kept so that it can be detected later whether the
                        // chunk holds samples outside of the markers.
                        tmpChks = append(tmpChks, chunkMetaAndChunkDiskMapperRef{
                                meta: chunks.Meta{
                                        MinTime: meta.OOOLastMinTime,
                                        MaxTime: meta.OOOLastMaxTime,
                                        Ref:     chunkRef,
                                },
                                ref:      c.ref,
                                origMinT: c.minTime,
                                origMaxT: c.maxTime,
                        })
                case c.OverlapsClosedInterval(mint, maxt):
                        // Any earlier chunk is only collected if it overlaps [mint, maxt].
                        tmpChks = append(tmpChks, chunkMetaAndChunkDiskMapperRef{
                                meta: chunks.Meta{
                                        MinTime: c.minTime,
                                        MaxTime: c.maxTime,
                                        Ref:     chunkRef,
                                },
                                ref: c.ref,
                        })
                }
        }

        // Next we want to sort all the collected chunks by min time so we can find
        // those that overlap and stop when we know the rest don't.
        slices.SortFunc(tmpChks, refLessByMinTimeAndMinRef)

        mc := &mergedOOOChunks{}
        // absoluteMax tracks the highest max time among the chunks selected so far.
        absoluteMax := int64(math.MinInt64)
        for _, c := range tmpChks {
                // The requested chunk is always selected. Any other chunk is skipped if
                // nothing has been selected yet or if it starts after the max time of
                // everything selected so far.
                if c.meta.Ref != meta.Ref && (len(mc.chunkIterables) == 0 || c.meta.MinTime > absoluteMax) {
                        continue
                }
                var iterable chunkenc.Iterable
                if c.meta.Ref == oooHeadRef {
                        var xor *chunkenc.XORChunk
                        var err error
                        // If head chunk min and max time match the meta OOO markers
                        // that means that the chunk has not expanded so we can append
                        // it as it is.
                        if s.ooo.oooHeadChunk.minTime == meta.OOOLastMinTime && s.ooo.oooHeadChunk.maxTime == meta.OOOLastMaxTime {
                                xor, err = s.ooo.oooHeadChunk.chunk.ToXOR() // TODO(jesus.vazquez) (This is an optimization idea that has no priority and might not be that useful) See if we could use a copy of the underlying slice. That would leave the more expensive ToXOR() function only for the usecase where Bytes() is called.
                        } else {
                                // We need to remove samples that are outside of the markers.
                                xor, err = s.ooo.oooHeadChunk.chunk.ToXORBetweenTimestamps(meta.OOOLastMinTime, meta.OOOLastMaxTime)
                        }
                        if err != nil {
                                return nil, fmt.Errorf("failed to convert ooo head chunk to xor chunk: %w", err)
                        }
                        iterable = xor
                } else {
                        chk, err := cdm.Chunk(c.ref)
                        if err != nil {
                                // Corruption errors get extra context; any other error is
                                // returned unchanged.
                                var cerr *chunks.CorruptionErr
                                if errors.As(err, &cerr) {
                                        return nil, fmt.Errorf("invalid ooo mmapped chunk: %w", err)
                                }
                                return nil, err
                        }
                        if c.meta.Ref == meta.OOOLastRef &&
                                (c.origMinT != meta.OOOLastMinTime || c.origMaxT != meta.OOOLastMaxTime) {
                                // The head expanded and was memory mapped so now we need to
                                // wrap the chunk within a chunk that doesn't allow us to iterate
                                // through samples outside of the OOOLastMinTime and
                                // OOOLastMaxTime markers.
                                iterable = boundedIterable{chk, meta.OOOLastMinTime, meta.OOOLastMaxTime}
                        } else {
                                iterable = chk
                        }
                }
                mc.chunkIterables = append(mc.chunkIterables, iterable)
                if c.meta.MaxTime > absoluteMax {
                        absoluteMax = c.meta.MaxTime
                }
        }

        return mc, nil
}

// Compile-time check that boundedIterable implements chunkenc.Iterable.
var _ chunkenc.Iterable = &boundedIterable{}

// boundedIterable is an implementation of chunkenc.Iterable that uses a
// boundedIterator that only iterates through samples whose timestamps are
// >= minT and <= maxT.
type boundedIterable struct {
        // chunk is the wrapped chunk that provides the underlying iterator.
        chunk chunkenc.Chunk
        // minT is the inclusive lower timestamp bound.
        minT  int64
        // maxT is the inclusive upper timestamp bound.
        maxT  int64
}

// Iterator returns a boundedIterator over the wrapped chunk, restricted to the
// bounds b.minT and b.maxT. The given iterator is handed to the chunk's own
// Iterator method. It panics if the chunk returns a nil iterator.
func (b boundedIterable) Iterator(iterator chunkenc.Iterator) chunkenc.Iterator {
        inner := b.chunk.Iterator(iterator)
        if inner != nil {
                return boundedIterator{Iterator: inner, minT: b.minT, maxT: b.maxT}
        }
        panic("iterator shouldn't be nil")
}

// Compile-time check that boundedIterator implements chunkenc.Iterator.
var _ chunkenc.Iterator = &boundedIterator{}

// boundedIterator is an implementation of Iterator that only iterates through
// samples whose timestamps are >= minT and <= maxT.
type boundedIterator struct {
        // Iterator is the wrapped iterator; methods that are not overridden are
        // promoted from it unchanged.
        chunkenc.Iterator
        // minT is the inclusive lower timestamp bound.
        minT int64
        // maxT is the inclusive upper timestamp bound.
        maxT int64
}

// Next advances the underlying iterator until it reaches a float sample whose
// timestamp lies within [minT, maxT] and returns chunkenc.ValFloat for it.
// Samples before minT are skipped. It returns chunkenc.ValNone as soon as a
// sample after maxT is reached, or when the underlying iterator returns
// anything other than chunkenc.ValFloat.
func (b boundedIterator) Next() chunkenc.ValueType {
        for {
                if b.Iterator.Next() != chunkenc.ValFloat {
                        return chunkenc.ValNone
                }
                ts, _ := b.Iterator.At()
                if ts < b.minT {
                        // Not yet within bounds, keep advancing.
                        continue
                }
                if ts > b.maxT {
                        return chunkenc.ValNone
                }
                return chunkenc.ValFloat
        }
}

// Seek advances the iterator to the first sample with a timestamp >= t,
// treating any t before minT as minT. It returns chunkenc.ValNone when no such
// sample exists or when the sample the underlying iterator lands on lies after
// maxT, so that a successful Seek never exposes a sample outside of the bounds.
func (b boundedIterator) Seek(t int64) chunkenc.ValueType {
        if t < b.minT {
                // We must seek at least up to b.minT if it is asked for something before that.
                if b.Iterator.Seek(b.minT) != chunkenc.ValFloat {
                        return chunkenc.ValNone
                }
                // The first sample at or after minT may already lie past maxT when no
                // sample falls within the bounds. Seeking the underlying iterator again
                // with the smaller t would not move it and would report that
                // out-of-bounds sample as valid, so the result is decided here.
                if b.Iterator.AtT() > b.maxT {
                        return chunkenc.ValNone
                }
                return chunkenc.ValFloat
        }
        if t > b.maxT {
                // We seek anyway so that the subsequent Next() calls will also return false.
                b.Iterator.Seek(t)
                return chunkenc.ValNone
        }
        // t is within the bounds, but the first sample at or after t may not be.
        val := b.Iterator.Seek(t)
        if val != chunkenc.ValNone && b.Iterator.AtT() > b.maxT {
                return chunkenc.ValNone
        }
        return val
}

// safeHeadChunk makes sure that the chunk can be accessed without a race condition.
// Its Iterator method builds the iterator while holding the lock of the series s.
type safeHeadChunk struct {
        // Chunk is the wrapped chunk; it is passed to s.iterator when iterating.
        chunkenc.Chunk
        // s is the series that is locked while the iterator is created.
        s        *memSeries
        // cid is the head chunk ID passed to s.iterator.
        cid      chunks.HeadChunkID
        // isoState is the isolation state passed to s.iterator.
        isoState *isolationState
}

// Iterator returns the iterator produced by the series for this chunk. The
// series lock is held only while the iterator is being created and is released
// before returning.
func (c *safeHeadChunk) Iterator(reuseIter chunkenc.Iterator) chunkenc.Iterator {
        series := c.s
        series.Lock()
        result := series.iterator(c.cid, c.Chunk, c.isoState, reuseIter)
        series.Unlock()
        return result
}

// iterator returns a chunk iterator over c for the requested chunk ID. When an
// isolation state is given and isolation is not disabled, the iterator is
// limited to the first stopAfter samples of the chunk, where stopAfter is derived
// from the append IDs tracked in s.txs. It returns chunkenc.NewNopIterator() if
// stopAfter ends up being zero, c.Iterator(it) if all samples of the chunk may be
// returned, and a stopIterator otherwise. The passed it is offered for reuse.
// It is unsafe to call this concurrently with s.append(...) without holding the series lock.
func (s *memSeries) iterator(id chunks.HeadChunkID, c chunkenc.Chunk, isoState *isolationState, it chunkenc.Iterator) chunkenc.Iterator {
        // ix is the position of the chunk relative to the first chunk of the series.
        ix := int(id) - int(s.firstChunkID)

        numSamples := c.NumSamples()
        // By default all samples of the chunk are returned.
        stopAfter := numSamples

        if isoState != nil && !isoState.IsolationDisabled() {
                totalSamples := 0    // Total samples in this series.
                previousSamples := 0 // Samples before this chunk.

                for j, d := range s.mmappedChunks {
                        totalSamples += int(d.numSamples)
                        if j < ix {
                                previousSamples += int(d.numSamples)
                        }
                }

                // From here on ix is relative to the head chunks (zero being the oldest one).
                ix -= len(s.mmappedChunks)
                if s.headChunks != nil {
                        // Iterate all head chunks from the oldest to the newest.
                        headChunksLen := s.headChunks.len()
                        for j := headChunksLen - 1; j >= 0; j-- {
                                chk := s.headChunks.atOffset(j)
                                chkSamples := chk.chunk.NumSamples()
                                totalSamples += chkSamples
                                // Chunk ID is len(s.mmappedChunks) + $(headChunks list position).
                                // Where $(headChunks list position) is zero for the oldest chunk and $(s.headChunks.len() - 1)
                                // for the newest (open) chunk.
                                if headChunksLen-1-j < ix {
                                        previousSamples += chkSamples
                                }
                        }
                }

                // Removing the extra transactionIDs that are relevant for samples that
                // come after this chunk, from the total transactionIDs.
                appendIDsToConsider := int(s.txs.txIDCount) - (totalSamples - (previousSamples + numSamples))

                // Iterate over the appendIDs, find the first one that the isolation state says not
                // to return.
                // Note that this it shadows the chunk iterator parameter within this block only.
                it := s.txs.iterator()
                for index := 0; index < appendIDsToConsider; index++ {
                        appendID := it.At()
                        if appendID <= isoState.maxAppendID { // Easy check first.
                                if _, ok := isoState.incompleteAppends[appendID]; !ok {
                                        it.Next()
                                        continue
                                }
                        }
                        // The remaining (appendIDsToConsider - index) append IDs, counted from
                        // the end of this chunk, must not be returned.
                        stopAfter = numSamples - (appendIDsToConsider - index)
                        if stopAfter < 0 {
                                stopAfter = 0 // Stopped in a previous chunk.
                        }
                        break
                }
        }

        if stopAfter == 0 {
                return chunkenc.NewNopIterator()
        }
        if stopAfter == numSamples {
                return c.Iterator(it)
        }
        return makeStopIterator(c, it, stopAfter)
}

// stopIterator wraps an Iterator, but only returns the first
// stopAfter values, if initialized with i=-1.
type stopIterator struct {
        // Iterator is the wrapped iterator that Next delegates to.
        chunkenc.Iterator

        // i is incremented by every Next call that is delegated to the wrapped
        // iterator; stopAfter is the number of values after which Next returns
        // chunkenc.ValNone.
        i, stopAfter int
}

// Next delegates to the wrapped iterator as long as fewer than stopAfter
// values have been requested so far, and returns chunkenc.ValNone afterwards
// without touching the wrapped iterator.
func (it *stopIterator) Next() chunkenc.ValueType {
        if next := it.i + 1; next < it.stopAfter {
                it.i = next
                return it.Iterator.Next()
        }
        return chunkenc.ValNone
}

// makeStopIterator returns a stopIterator over c that yields at most stopAfter
// values. If it already is a *stopIterator, that object is reset and returned,
// and its wrapped iterator is offered to c for reuse; otherwise a new
// stopIterator is allocated and it itself is offered to c for reuse.
func makeStopIterator(c chunkenc.Chunk, it chunkenc.Iterator, stopAfter int) chunkenc.Iterator {
        si, reusable := it.(*stopIterator)
        if reusable {
                // Re-use the Iterator object if it is a stopIterator.
                it = si.Iterator
        } else {
                si = &stopIterator{}
        }

        si.Iterator = c.Iterator(it)
        si.i = -1
        si.stopAfter = stopAfter
        return si
}

// Copyright 2021 The Prometheus Authors
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package tsdb

import (
        "errors"
        "fmt"
        "math"
        "os"
        "path/filepath"
        "strconv"
        "strings"
        "sync"
        "time"

        "github.com/go-kit/log/level"
        "go.uber.org/atomic"

        "github.com/prometheus/prometheus/model/exemplar"
        "github.com/prometheus/prometheus/model/histogram"
        "github.com/prometheus/prometheus/model/labels"
        "github.com/prometheus/prometheus/model/metadata"
        "github.com/prometheus/prometheus/storage"
        "github.com/prometheus/prometheus/tsdb/chunkenc"
        "github.com/prometheus/prometheus/tsdb/chunks"
        "github.com/prometheus/prometheus/tsdb/encoding"
        tsdb_errors "github.com/prometheus/prometheus/tsdb/errors"
        "github.com/prometheus/prometheus/tsdb/fileutil"
        "github.com/prometheus/prometheus/tsdb/record"
        "github.com/prometheus/prometheus/tsdb/tombstones"
        "github.com/prometheus/prometheus/tsdb/wlog"
        "github.com/prometheus/prometheus/util/zeropool"
)

// histogramRecord combines both RefHistogramSample and RefFloatHistogramSample
// to simplify the WAL replay. During replay, records built from a
// RefHistogramSample set h, and records built from a RefFloatHistogramSample
// set fh.
type histogramRecord struct {
        // ref is the reference of the series the sample belongs to.
        ref chunks.HeadSeriesRef
        // t is the timestamp of the sample.
        t   int64
        // h is the integer histogram, set for RefHistogramSample records.
        h   *histogram.Histogram
        // fh is the float histogram, set for RefFloatHistogramSample records.
        fh  *histogram.FloatHistogram
}

func (h *Head) loadWAL(r *wlog.Reader, syms *labels.SymbolTable, multiRef map[chunks.HeadSeriesRef]chunks.HeadSeriesRef, mmappedChunks, oooMmappedChunks map[chunks.HeadSeriesRef][]*mmappedChunk) (err error) {
        // Track number of samples that referenced a series we don't know about
        // for error reporting.
        var unknownRefs atomic.Uint64
        var unknownExemplarRefs atomic.Uint64
        var unknownHistogramRefs atomic.Uint64
        var unknownMetadataRefs atomic.Uint64
        // Track number of series records that had overlapping m-map chunks.
        var mmapOverlappingChunks atomic.Uint64

        // Start workers that each process samples for a partition of the series ID space.
        var (
                wg             sync.WaitGroup
                concurrency    = h.opts.WALReplayConcurrency
                processors     = make([]walSubsetProcessor, concurrency)
                exemplarsInput chan record.RefExemplar

                shards          = make([][]record.RefSample, concurrency)
                histogramShards = make([][]histogramRecord, concurrency)

                decoded                      = make(chan interface{}, 10)
                decodeErr, seriesCreationErr error

                seriesPool          zeropool.Pool[[]record.RefSeries]
                samplesPool         zeropool.Pool[[]record.RefSample]
                tstonesPool         zeropool.Pool[[]tombstones.Stone]
                exemplarsPool       zeropool.Pool[[]record.RefExemplar]
                histogramsPool      zeropool.Pool[[]record.RefHistogramSample]
                floatHistogramsPool zeropool.Pool[[]record.RefFloatHistogramSample]
                metadataPool        zeropool.Pool[[]record.RefMetadata]
        )

        defer func() {
                // For CorruptionErr ensure to terminate all workers before exiting.
                _, ok := err.(*wlog.CorruptionErr)
                if ok || seriesCreationErr != nil {
                        for i := 0; i < concurrency; i++ {
                                processors[i].closeAndDrain()
                        }
                        close(exemplarsInput)
                        wg.Wait()
                }
        }()

        wg.Add(concurrency)
        for i := 0; i < concurrency; i++ {
                processors[i].setup()

                go func(wp *walSubsetProcessor) {
                        unknown, unknownHistograms, overlapping := wp.processWALSamples(h, mmappedChunks, oooMmappedChunks)
                        unknownRefs.Add(unknown)
                        mmapOverlappingChunks.Add(overlapping)
                        unknownHistogramRefs.Add(unknownHistograms)
                        wg.Done()
                }(&processors[i])
        }

        wg.Add(1)
        exemplarsInput = make(chan record.RefExemplar, 300)
        go func(input <-chan record.RefExemplar) {
                var err error
                defer wg.Done()
                for e := range input {
                        ms := h.series.getByID(e.Ref)
                        if ms == nil {
                                unknownExemplarRefs.Inc()
                                continue
                        }

                        if e.T < h.minValidTime.Load() {
                                continue
                        }
                        // At the moment the only possible error here is out of order exemplars, which we shouldn't see when
                        // replaying the WAL, so lets just log the error if it's not that type.
                        err = h.exemplars.AddExemplar(ms.lset, exemplar.Exemplar{Ts: e.T, Value: e.V, Labels: e.Labels})
                        if err != nil && errors.Is(err, storage.ErrOutOfOrderExemplar) {
                                level.Warn(h.logger).Log("msg", "Unexpected error when replaying WAL on exemplar record", "err", err)
                        }
                }
        }(exemplarsInput)

        go func() {
                defer close(decoded)
                var err error
                dec := record.NewDecoder(syms)
                for r.Next() {
                        rec := r.Record()
                        switch dec.Type(rec) {
                        case record.Series:
                                series := seriesPool.Get()[:0]
                                series, err = dec.Series(rec, series)
                                if err != nil {
                                        decodeErr = &wlog.CorruptionErr{
                                                Err:     fmt.Errorf("decode series: %w", err),
                                                Segment: r.Segment(),
                                                Offset:  r.Offset(),
                                        }
                                        return
                                }
                                decoded <- series
                        case record.Samples:
                                samples := samplesPool.Get()[:0]
                                samples, err = dec.Samples(rec, samples)
                                if err != nil {
                                        decodeErr = &wlog.CorruptionErr{
                                                Err:     fmt.Errorf("decode samples: %w", err),
                                                Segment: r.Segment(),
                                                Offset:  r.Offset(),
                                        }
                                        return
                                }
                                decoded <- samples
                        case record.Tombstones:
                                tstones := tstonesPool.Get()[:0]
                                tstones, err = dec.Tombstones(rec, tstones)
                                if err != nil {
                                        decodeErr = &wlog.CorruptionErr{
                                                Err:     fmt.Errorf("decode tombstones: %w", err),
                                                Segment: r.Segment(),
                                                Offset:  r.Offset(),
                                        }
                                        return
                                }
                                decoded <- tstones
                        case record.Exemplars:
                                exemplars := exemplarsPool.Get()[:0]
                                exemplars, err = dec.Exemplars(rec, exemplars)
                                if err != nil {
                                        decodeErr = &wlog.CorruptionErr{
                                                Err:     fmt.Errorf("decode exemplars: %w", err),
                                                Segment: r.Segment(),
                                                Offset:  r.Offset(),
                                        }
                                        return
                                }
                                decoded <- exemplars
                        case record.HistogramSamples:
                                hists := histogramsPool.Get()[:0]
                                hists, err = dec.HistogramSamples(rec, hists)
                                if err != nil {
                                        decodeErr = &wlog.CorruptionErr{
                                                Err:     fmt.Errorf("decode histograms: %w", err),
                                                Segment: r.Segment(),
                                                Offset:  r.Offset(),
                                        }
                                        return
                                }
                                decoded <- hists
                        case record.FloatHistogramSamples:
                                hists := floatHistogramsPool.Get()[:0]
                                hists, err = dec.FloatHistogramSamples(rec, hists)
                                if err != nil {
                                        decodeErr = &wlog.CorruptionErr{
                                                Err:     fmt.Errorf("decode float histograms: %w", err),
                                                Segment: r.Segment(),
                                                Offset:  r.Offset(),
                                        }
                                        return
                                }
                                decoded <- hists
                        case record.Metadata:
                                meta := metadataPool.Get()[:0]
                                meta, err := dec.Metadata(rec, meta)
                                if err != nil {
                                        decodeErr = &wlog.CorruptionErr{
                                                Err:     fmt.Errorf("decode metadata: %w", err),
                                                Segment: r.Segment(),
                                                Offset:  r.Offset(),
                                        }
                                        return
                                }
                                decoded <- meta
                        default:
                                // Noop.
                        }
                }
        }()

        // The records are always replayed from the oldest to the newest.
Outer:
        for d := range decoded {
                switch v := d.(type) {
                case []record.RefSeries:
                        for _, walSeries := range v {
                                mSeries, created, err := h.getOrCreateWithID(walSeries.Ref, walSeries.Labels.Hash(), walSeries.Labels)
                                if err != nil {
                                        seriesCreationErr = err
                                        break Outer
                                }

                                if chunks.HeadSeriesRef(h.lastSeriesID.Load()) < walSeries.Ref {
                                        h.lastSeriesID.Store(uint64(walSeries.Ref))
                                }
                                if !created {
                                        multiRef[walSeries.Ref] = mSeries.ref
                                }

                                idx := uint64(mSeries.ref) % uint64(concurrency)
                                processors[idx].input <- walSubsetProcessorInputItem{walSeriesRef: walSeries.Ref, existingSeries: mSeries}
                        }
                        seriesPool.Put(v)
                case []record.RefSample:
                        samples := v
                        minValidTime := h.minValidTime.Load()
                        // We split up the samples into chunks of 5000 samples or less.
                        // With O(300 * #cores) in-flight sample batches, large scrapes could otherwise
                        // cause thousands of very large in flight buffers occupying large amounts
                        // of unused memory.
                        for len(samples) > 0 {
                                m := 5000
                                if len(samples) < m {
                                        m = len(samples)
                                }
                                for i := 0; i < concurrency; i++ {
                                        if shards[i] == nil {
                                                shards[i] = processors[i].reuseBuf()
                                        }
                                }
                                for _, sam := range samples[:m] {
                                        if sam.T < minValidTime {
                                                continue // Before minValidTime: discard.
                                        }
                                        if r, ok := multiRef[sam.Ref]; ok {
                                                sam.Ref = r
                                        }
                                        mod := uint64(sam.Ref) % uint64(concurrency)
                                        shards[mod] = append(shards[mod], sam)
                                }
                                for i := 0; i < concurrency; i++ {
                                        if len(shards[i]) > 0 {
                                                processors[i].input <- walSubsetProcessorInputItem{samples: shards[i]}
                                                shards[i] = nil
                                        }
                                }
                                samples = samples[m:]
                        }
                        samplesPool.Put(v)
                case []tombstones.Stone:
                        for _, s := range v {
                                for _, itv := range s.Intervals {
                                        if itv.Maxt < h.minValidTime.Load() {
                                                continue
                                        }
                                        if m := h.series.getByID(chunks.HeadSeriesRef(s.Ref)); m == nil {
                                                unknownRefs.Inc()
                                                continue
                                        }
                                        h.tombstones.AddInterval(s.Ref, itv)
                                }
                        }
                        tstonesPool.Put(v)
                case []record.RefExemplar:
                        for _, e := range v {
                                exemplarsInput <- e
                        }
                        exemplarsPool.Put(v)
                case []record.RefHistogramSample:
                        samples := v
                        minValidTime := h.minValidTime.Load()
                        // We split up the samples into chunks of 5000 samples or less.
                        // With O(300 * #cores) in-flight sample batches, large scrapes could otherwise
                        // cause thousands of very large in flight buffers occupying large amounts
                        // of unused memory.
                        for len(samples) > 0 {
                                m := 5000
                                if len(samples) < m {
                                        m = len(samples)
                                }
                                for i := 0; i < concurrency; i++ {
                                        if histogramShards[i] == nil {
                                                histogramShards[i] = processors[i].reuseHistogramBuf()
                                        }
                                }
                                for _, sam := range samples[:m] {
                                        if sam.T < minValidTime {
                                                continue // Before minValidTime: discard.
                                        }
                                        if r, ok := multiRef[sam.Ref]; ok {
                                                sam.Ref = r
                                        }
                                        mod := uint64(sam.Ref) % uint64(concurrency)
                                        histogramShards[mod] = append(histogramShards[mod], histogramRecord{ref: sam.Ref, t: sam.T, h: sam.H})
                                }
                                for i := 0; i < concurrency; i++ {
                                        if len(histogramShards[i]) > 0 {
                                                processors[i].input <- walSubsetProcessorInputItem{histogramSamples: histogramShards[i]}
                                                histogramShards[i] = nil
                                        }
                                }
                                samples = samples[m:]
                        }
                        histogramsPool.Put(v)
                case []record.RefFloatHistogramSample:
                        samples := v
                        minValidTime := h.minValidTime.Load()
                        // We split up the samples into chunks of 5000 samples or less.
                        // With O(300 * #cores) in-flight sample batches, large scrapes could otherwise
                        // cause thousands of very large in flight buffers occupying large amounts
                        // of unused memory.
                        for len(samples) > 0 {
                                m := 5000
                                if len(samples) < m {
                                        m = len(samples)
                                }
                                for i := 0; i < concurrency; i++ {
                                        if histogramShards[i] == nil {
                                                histogramShards[i] = processors[i].reuseHistogramBuf()
                                        }
                                }
                                for _, sam := range samples[:m] {
                                        if sam.T < minValidTime {
                                                continue // Before minValidTime: discard.
                                        }
                                        if r, ok := multiRef[sam.Ref]; ok {
                                                sam.Ref = r
                                        }
                                        mod := uint64(sam.Ref) % uint64(concurrency)
                                        histogramShards[mod] = append(histogramShards[mod], histogramRecord{ref: sam.Ref, t: sam.T, fh: sam.FH})
                                }
                                for i := 0; i < concurrency; i++ {
                                        if len(histogramShards[i]) > 0 {
                                                processors[i].input <- walSubsetProcessorInputItem{histogramSamples: histogramShards[i]}
                                                histogramShards[i] = nil
                                        }
                                }
                                samples = samples[m:]
                        }
                        floatHistogramsPool.Put(v)
                case []record.RefMetadata:
                        for _, m := range v {
                                s := h.series.getByID(m.Ref)
                                if s == nil {
                                        unknownMetadataRefs.Inc()
                                        continue
                                }
                                s.meta = &metadata.Metadata{
                                        Type: record.ToMetricType(m.Type),
                                        Unit: m.Unit,
                                        Help: m.Help,
                                }
                        }
                        metadataPool.Put(v)
                default:
                        panic(fmt.Errorf("unexpected decoded type: %T", d))
                }
        }

        if decodeErr != nil {
                return decodeErr
        }
        if seriesCreationErr != nil {
                // Drain the channel to unblock the goroutine.
                for range decoded {
                }
                return seriesCreationErr
        }

        // Signal termination to each worker and wait for it to close its output channel.
        for i := 0; i < concurrency; i++ {
                processors[i].closeAndDrain()
        }
        close(exemplarsInput)
        wg.Wait()

        if err := r.Err(); err != nil {
                return fmt.Errorf("read records: %w", err)
        }

        if unknownRefs.Load()+unknownExemplarRefs.Load()+unknownHistogramRefs.Load()+unknownMetadataRefs.Load() > 0 {
                level.Warn(h.logger).Log(
                        "msg", "Unknown series references",
                        "samples", unknownRefs.Load(),
                        "exemplars", unknownExemplarRefs.Load(),
                        "histograms", unknownHistogramRefs.Load(),
                        "metadata", unknownMetadataRefs.Load(),
                )
        }
        if count := mmapOverlappingChunks.Load(); count > 0 {
                level.Info(h.logger).Log("msg", "Overlapping m-map chunks on duplicate series records", "count", count)
        }
        return nil
}

// resetSeriesWithMMappedChunks swaps the in-order and out-of-order m-mapped
// chunks of mSeries for mmc and oooMmc, adjusts the chunk metrics and the
// head's time bounds to match, and clears the series' head chunk state.
//
// It reports true when walSeriesRef differs from the series' own ref (a
// duplicate series record) and the in-order chunks already attached to the
// series overlap in time with mmc.
//
// It is only used during the WAL replay.
func (h *Head) resetSeriesWithMMappedChunks(mSeries *memSeries, mmc, oooMmc []*mmappedChunk, walSeriesRef chunks.HeadSeriesRef) (overlapped bool) {
        prevMmc := mSeries.mmappedChunks

        // Checking if the new m-mapped chunks overlap with the already existing ones.
        if mSeries.ref != walSeriesRef && len(prevMmc) > 0 && len(mmc) > 0 {
                oldMint, oldMaxt := prevMmc[0].minTime, prevMmc[len(prevMmc)-1].maxTime
                newMint, newMaxt := mmc[0].minTime, mmc[len(mmc)-1].maxTime
                if overlapsClosedInterval(oldMint, oldMaxt, newMint, newMaxt) {
                        level.Debug(h.logger).Log(
                                "msg", "M-mapped chunks overlap on a duplicate series record",
                                "series", mSeries.lset.String(),
                                "oldref", mSeries.ref,
                                "oldmint", oldMint,
                                "oldmaxt", oldMaxt,
                                "newref", walSeriesRef,
                                "newmint", newMint,
                                "newmaxt", newMaxt,
                        )
                        overlapped = true
                }
        }

        // Account for the incoming chunks and for the in-order chunks being replaced.
        incoming := len(mmc) + len(oooMmc)
        h.metrics.chunksCreated.Add(float64(incoming))
        h.metrics.chunksRemoved.Add(float64(len(prevMmc)))
        h.metrics.chunks.Add(float64(incoming - len(prevMmc)))

        // The out-of-order chunks being replaced are accounted for separately.
        if prevOOO := mSeries.ooo; prevOOO != nil {
                dropped := float64(len(prevOOO.oooMmappedChunks))
                h.metrics.chunksRemoved.Add(dropped)
                h.metrics.chunks.Sub(dropped)
        }

        mSeries.mmappedChunks = mmc
        switch {
        case len(oooMmc) == 0:
                mSeries.ooo = nil
        case mSeries.ooo == nil:
                mSeries.ooo = &memSeriesOOOFields{oooMmappedChunks: oooMmc}
        default:
                // Keep the existing allocation but wipe every other OOO field.
                *mSeries.ooo = memSeriesOOOFields{oooMmappedChunks: oooMmc}
        }

        // Cache the last mmapped chunk time, so we can skip calling append() for samples it will reject.
        if n := len(mmc); n > 0 {
                mSeries.mmMaxTime = mmc[n-1].maxTime
                h.updateMinMaxTime(mmc[0].minTime, mSeries.mmMaxTime)
        } else {
                mSeries.mmMaxTime = math.MinInt64
        }

        if len(oooMmc) != 0 {
                // Mint and maxt can be in any chunk, they are not sorted.
                oooMint, oooMaxt := int64(math.MaxInt64), int64(math.MinInt64)
                for _, c := range oooMmc {
                        if c.minTime < oooMint {
                                oooMint = c.minTime
                        }
                        if c.maxTime > oooMaxt {
                                oooMaxt = c.maxTime
                        }
                }
                h.updateMinOOOMaxOOOTime(oooMint, oooMaxt)
        }

        // Any samples replayed till now would already be compacted. Resetting the head chunk.
        mSeries.nextAt = 0
        mSeries.headChunks = nil
        mSeries.app = nil
        return overlapped
}

// walSubsetProcessor holds the channels of one WAL replay worker.
// processWALSamples consumes work items from input and offers the processed
// buffers back on the output channels, from where reuseBuf and
// reuseHistogramBuf pick them up for reuse.
type walSubsetProcessor struct {
        // input carries the work items consumed by processWALSamples.
        input            chan walSubsetProcessorInputItem
        // output carries float sample buffers that have already been processed.
        output           chan []record.RefSample
        // histogramsOutput carries histogram record buffers that have already been processed.
        histogramsOutput chan []histogramRecord
}

// walSubsetProcessorInputItem is one unit of work for a walSubsetProcessor.
// When existingSeries is non-nil, processWALSamples resets that series'
// m-mapped chunks using walSeriesRef and ignores the sample fields; otherwise
// it appends samples and histogramSamples to the head.
type walSubsetProcessorInputItem struct {
        // samples is a batch of float samples to append.
        samples          []record.RefSample
        // histogramSamples is a batch of (float) histogram samples to append.
        histogramSamples []histogramRecord
        // existingSeries, when set, is the series whose m-mapped chunks must be reset.
        existingSeries   *memSeries
        // walSeriesRef is the series reference used to look up the m-mapped chunks for the reset.
        walSeriesRef     chunks.HeadSeriesRef
}

// setup allocates the processor's input and output channels, each buffered
// for 300 entries.
func (wp *walSubsetProcessor) setup() {
        const queueLen = 300

        wp.input = make(chan walSubsetProcessorInputItem, queueLen)
        wp.output = make(chan []record.RefSample, queueLen)
        wp.histogramsOutput = make(chan []histogramRecord, queueLen)
}

// closeAndDrain closes the input channel and then discards everything still
// queued on both output channels. It returns only once both output channels
// have been closed.
func (wp *walSubsetProcessor) closeAndDrain() {
        close(wp.input)

        for {
                if _, open := <-wp.output; !open {
                        break
                }
        }
        for {
                if _, open := <-wp.histogramsOutput; !open {
                        break
                }
        }
}

// reuseBuf makes a non-blocking attempt to take a processed float sample
// buffer from the output channel. On success the buffer is returned truncated
// to zero length so its capacity can be reused; otherwise nil is returned.
func (wp *walSubsetProcessor) reuseBuf() []record.RefSample {
        var recycled []record.RefSample
        select {
        case recycled = <-wp.output:
                recycled = recycled[:0]
        default:
                // Nothing available right now.
        }
        return recycled
}

// reuseHistogramBuf makes a non-blocking attempt to take a processed histogram
// record buffer from the histogramsOutput channel. On success the buffer is
// returned truncated to zero length so its capacity can be reused; otherwise
// nil is returned.
func (wp *walSubsetProcessor) reuseHistogramBuf() []histogramRecord {
        var recycled []histogramRecord
        select {
        case recycled = <-wp.histogramsOutput:
                recycled = recycled[:0]
        default:
                // Nothing available right now.
        }
        return recycled
}

// processWALSamples consumes work items from wp.input until that channel is
// closed. Items carrying an existing series trigger a reset of that series'
// m-mapped chunks; all other items have their float and histogram samples
// appended to the head, after which the buffers are offered (without blocking)
// on the output channels for reuse.
//
// Histogram samples before the head's minValidTime are discarded here; float
// samples are not checked against minValidTime in this function. Samples at or
// before a series' last m-mapped chunk time are skipped.
//
// It returns the number of float and histogram samples that referenced an
// unknown series and the number of series resets that reported overlapping
// m-mapped chunks. Both output channels are closed on return.
func (wp *walSubsetProcessor) processWALSamples(h *Head, mmappedChunks, oooMmappedChunks map[chunks.HeadSeriesRef][]*mmappedChunk) (unknownRefs, unknownHistogramRefs, mmapOverlappingChunks uint64) {
        defer close(wp.output)
        defer close(wp.histogramsOutput)

        var (
                minValidTime = h.minValidTime.Load()
                // Time range covered by the samples this worker has appended.
                mint = int64(math.MaxInt64)
                maxt = int64(math.MinInt64)
                opts = chunkOpts{
                        chunkDiskMapper: h.chunkDiskMapper,
                        chunkRange:      h.chunkRange.Load(),
                        samplesPerChunk: h.opts.SamplesPerChunk,
                }
        )

        for item := range wp.input {
                if series := item.existingSeries; series != nil {
                        ref := item.walSeriesRef
                        if h.resetSeriesWithMMappedChunks(series, mmappedChunks[ref], oooMmappedChunks[ref], ref) {
                                mmapOverlappingChunks++
                        }
                        continue
                }

                for _, sample := range item.samples {
                        series := h.series.getByID(sample.Ref)
                        if series == nil {
                                unknownRefs++
                                continue
                        }
                        // Already covered by an m-mapped chunk.
                        if sample.T <= series.mmMaxTime {
                                continue
                        }
                        _, created := series.append(sample.T, sample.V, 0, opts)
                        if created {
                                h.metrics.chunksCreated.Inc()
                                h.metrics.chunks.Inc()
                                _ = series.mmapChunks(h.chunkDiskMapper)
                        }
                        if sample.T > maxt {
                                maxt = sample.T
                        }
                        if sample.T < mint {
                                mint = sample.T
                        }
                }
                // Hand the buffer back for reuse; drop it if the channel is full.
                select {
                case wp.output <- item.samples:
                default:
                }

                for _, hs := range item.histogramSamples {
                        if hs.t < minValidTime {
                                continue
                        }
                        series := h.series.getByID(hs.ref)
                        if series == nil {
                                unknownHistogramRefs++
                                continue
                        }
                        // Already covered by an m-mapped chunk.
                        if hs.t <= series.mmMaxTime {
                                continue
                        }
                        var created bool
                        if hs.h == nil {
                                _, created = series.appendFloatHistogram(hs.t, hs.fh, 0, opts)
                        } else {
                                _, created = series.appendHistogram(hs.t, hs.h, 0, opts)
                        }
                        if created {
                                h.metrics.chunksCreated.Inc()
                                h.metrics.chunks.Inc()
                        }
                        if hs.t > maxt {
                                maxt = hs.t
                        }
                        if hs.t < mint {
                                mint = hs.t
                        }
                }

                // Hand the buffer back for reuse; drop it if the channel is full.
                select {
                case wp.histogramsOutput <- item.histogramSamples:
                default:
                }
        }
        h.updateMinMaxTime(mint, maxt)

        return unknownRefs, unknownHistogramRefs, mmapOverlappingChunks
}

// loadWBL replays the records read from r into the head.
//
// Records are decoded on a background goroutine using a decoder built from
// syms. Sample records are split into batches of at most 5000 samples, sharded
// by series reference across h.opts.WALReplayConcurrency workers and inserted
// by processWBLSamples. M-map marker records make the owning worker drop the
// out-of-order head chunk of the referenced series; markers positioned after
// lastMmapRef are skipped. References found in multiRef are rewritten to the
// mapped reference before use.
//
// A decoding failure is returned as a *wlog.CorruptionErr wrapped in
// *errLoadWbl. A reader error is returned wrapped with "read records".
// References to unknown series are only counted and logged as a warning.
func (h *Head) loadWBL(r *wlog.Reader, syms *labels.SymbolTable, multiRef map[chunks.HeadSeriesRef]chunks.HeadSeriesRef, lastMmapRef chunks.ChunkDiskMapperRef) (err error) {
        // Track number of samples, m-map markers, that referenced a series we don't know about
        // for error reporting.
        var unknownRefs, mmapMarkerUnknownRefs atomic.Uint64

        lastSeq, lastOff := lastMmapRef.Unpack()
        // Start workers that each process samples for a partition of the series ID space.
        var (
                wg          sync.WaitGroup
                concurrency = h.opts.WALReplayConcurrency
                processors  = make([]wblSubsetProcessor, concurrency)

                dec    = record.NewDecoder(syms)
                shards = make([][]record.RefSample, concurrency)

                decodedCh   = make(chan interface{}, 10)
                decodeErr   error
                samplesPool = sync.Pool{
                        New: func() interface{} {
                                return []record.RefSample{}
                        },
                }
                markersPool = sync.Pool{
                        New: func() interface{} {
                                return []record.RefMmapMarker{}
                        },
                }
        )

        defer func() {
                // For CorruptionErr ensure to terminate all workers before exiting.
                // We also wrap it to identify OOO WBL corruption.
                //
                // The direct type assertion is deliberate: it only matches the unwrapped
                // decodeErr, which is returned before the workers are shut down. The reader
                // error is returned wrapped and only after the workers were already closed,
                // so it must not trigger a second closeAndDrain.
                _, ok := err.(*wlog.CorruptionErr)
                if ok {
                        err = &errLoadWbl{err: err}
                        for i := 0; i < concurrency; i++ {
                                processors[i].closeAndDrain()
                        }
                        wg.Wait()
                }
        }()

        wg.Add(concurrency)
        for i := 0; i < concurrency; i++ {
                processors[i].setup()

                go func(wp *wblSubsetProcessor) {
                        unknown := wp.processWBLSamples(h)
                        unknownRefs.Add(unknown)
                        wg.Done()
                }(&processors[i])
        }

        go func() {
                defer close(decodedCh)
                // Keep decoding errors local to this goroutine; the function's named
                // result is owned by the calling goroutine. Failures are reported
                // through decodeErr, which is read only after decodedCh is closed.
                var err error
                for r.Next() {
                        rec := r.Record()
                        switch dec.Type(rec) {
                        case record.Samples:
                                samples := samplesPool.Get().([]record.RefSample)[:0]
                                samples, err = dec.Samples(rec, samples)
                                if err != nil {
                                        decodeErr = &wlog.CorruptionErr{
                                                Err:     fmt.Errorf("decode samples: %w", err),
                                                Segment: r.Segment(),
                                                Offset:  r.Offset(),
                                        }
                                        return
                                }
                                decodedCh <- samples
                        case record.MmapMarkers:
                                markers := markersPool.Get().([]record.RefMmapMarker)[:0]
                                markers, err = dec.MmapMarkers(rec, markers)
                                if err != nil {
                                        decodeErr = &wlog.CorruptionErr{
                                                Err:     fmt.Errorf("decode mmap markers: %w", err),
                                                Segment: r.Segment(),
                                                Offset:  r.Offset(),
                                        }
                                        return
                                }
                                decodedCh <- markers
                        default:
                                // Noop.
                        }
                }
        }()

        // The records are always replayed from the oldest to the newest.
        for d := range decodedCh {
                switch v := d.(type) {
                case []record.RefSample:
                        samples := v
                        // We split up the samples into parts of 5000 samples or less.
                        // With O(300 * #cores) in-flight sample batches, large scrapes could otherwise
                        // cause thousands of very large in flight buffers occupying large amounts
                        // of unused memory.
                        for len(samples) > 0 {
                                m := 5000
                                if len(samples) < m {
                                        m = len(samples)
                                }
                                for i := 0; i < concurrency; i++ {
                                        if shards[i] == nil {
                                                shards[i] = processors[i].reuseBuf()
                                        }
                                }
                                for _, sam := range samples[:m] {
                                        if r, ok := multiRef[sam.Ref]; ok {
                                                sam.Ref = r
                                        }
                                        mod := uint64(sam.Ref) % uint64(concurrency)
                                        shards[mod] = append(shards[mod], sam)
                                }
                                for i := 0; i < concurrency; i++ {
                                        if len(shards[i]) > 0 {
                                                processors[i].input <- wblSubsetProcessorInputItem{samples: shards[i]}
                                                shards[i] = nil
                                        }
                                }
                                samples = samples[m:]
                        }
                        // The samples were copied into the shards, so the decode buffer can be recycled.
                        samplesPool.Put(d)
                case []record.RefMmapMarker:
                        markers := v
                        for _, rm := range markers {
                                seq, off := rm.MmapRef.Unpack()
                                if seq > lastSeq || (seq == lastSeq && off > lastOff) {
                                        // This m-map chunk from markers was not present during
                                        // the load of mmapped chunks that happened in the head
                                        // initialization.
                                        continue
                                }

                                if r, ok := multiRef[rm.Ref]; ok {
                                        rm.Ref = r
                                }

                                ms := h.series.getByID(rm.Ref)
                                if ms == nil {
                                        mmapMarkerUnknownRefs.Inc()
                                        continue
                                }
                                idx := uint64(ms.ref) % uint64(concurrency)
                                processors[idx].input <- wblSubsetProcessorInputItem{mmappedSeries: ms}
                        }
                        // Only the series pointers were handed to the workers, so the marker
                        // buffer can be recycled; without this the pool never gets a buffer back.
                        markersPool.Put(d)
                default:
                        panic(fmt.Errorf("unexpected decodedCh type: %T", d))
                }
        }

        if decodeErr != nil {
                return decodeErr
        }

        // Signal termination to each worker and wait for it to close its output channel.
        for i := 0; i < concurrency; i++ {
                processors[i].closeAndDrain()
        }
        wg.Wait()

        if err := r.Err(); err != nil {
                return fmt.Errorf("read records: %w", err)
        }

        if unknownRefs.Load() > 0 || mmapMarkerUnknownRefs.Load() > 0 {
                level.Warn(h.logger).Log("msg", "Unknown series references for ooo WAL replay", "samples", unknownRefs.Load(), "mmap_markers", mmapMarkerUnknownRefs.Load())
        }
        return nil
}

// errLoadWbl wraps an error so that it can be recognised as coming from
// loading the WBL.
type errLoadWbl struct {
        err error
}

// Error returns the message of the wrapped error unchanged.
func (w errLoadWbl) Error() string {
        wrapped := w.err
        return wrapped.Error()
}

// Cause returns the wrapped error.
func (w errLoadWbl) Cause() error {
        return w.err
}

// Unwrap returns the wrapped error.
func (w errLoadWbl) Unwrap() error {
        return w.err
}

// wblSubsetProcessor holds the channels of one WBL replay worker.
// processWBLSamples consumes work items from input and offers the processed
// sample buffers back on output, from where reuseBuf picks them up for reuse.
type wblSubsetProcessor struct {
        // input carries the work items consumed by processWBLSamples.
        input  chan wblSubsetProcessorInputItem
        // output carries sample buffers that have already been processed.
        output chan []record.RefSample
}

// wblSubsetProcessorInputItem is one unit of work for a wblSubsetProcessor.
// When mmappedSeries is non-nil and has out-of-order state, processWBLSamples
// clears that series' out-of-order head chunk and skips the samples; otherwise
// it inserts samples into the head.
type wblSubsetProcessorInputItem struct {
        // mmappedSeries, when set, is the series referenced by an m-map marker.
        mmappedSeries *memSeries
        // samples is a batch of samples to insert.
        samples       []record.RefSample
}

// setup allocates the processor's output and input channels, each buffered
// for 300 entries.
func (wp *wblSubsetProcessor) setup() {
        const queueLen = 300

        wp.output = make(chan []record.RefSample, queueLen)
        wp.input = make(chan wblSubsetProcessorInputItem, queueLen)
}

// closeAndDrain closes the input channel and then discards everything still
// queued on the output channel. It returns only once the output channel has
// been closed.
func (wp *wblSubsetProcessor) closeAndDrain() {
        close(wp.input)

        for {
                if _, open := <-wp.output; !open {
                        return
                }
        }
}

// reuseBuf makes a non-blocking attempt to take a processed sample buffer from
// the output channel. On success the buffer is returned truncated to zero
// length so its capacity can be reused; otherwise nil is returned.
func (wp *wblSubsetProcessor) reuseBuf() []record.RefSample {
        var recycled []record.RefSample
        select {
        case recycled = <-wp.output:
                recycled = recycled[:0]
        default:
                // Nothing available right now.
        }
        return recycled
}

// processWBLSamples consumes work items from wp.input until that channel is
// closed. An item carrying a series with out-of-order state clears that
// series' out-of-order head chunk; any other item has its samples inserted
// into the head, after which the buffer is offered (without blocking) on the
// output channel for reuse.
//
// It returns the number of samples that referenced an unknown series. The
// output channel is closed on return.
func (wp *wblSubsetProcessor) processWBLSamples(h *Head) (unknownRefs uint64) {
        defer close(wp.output)

        var (
                oooCapMax = h.opts.OutOfOrderCapMax.Load()
                // We don't check for minValidTime for ooo samples.
                mint = int64(math.MaxInt64)
                maxt = int64(math.MinInt64)
        )
        for item := range wp.input {
                if series := item.mmappedSeries; series != nil && series.ooo != nil {
                        // All samples till now have been m-mapped. Hence clear out the headChunk.
                        // In case some samples slipped through and went into m-map chunks because of changed
                        // chunk size parameters, we are not taking care of that here.
                        // TODO(codesome): see if there is a way to avoid duplicate m-map chunks if
                        // the size of ooo chunk was reduced between restart.
                        series.ooo.oooHeadChunk = nil
                        continue
                }
                for _, sample := range item.samples {
                        series := h.series.getByID(sample.Ref)
                        if series == nil {
                                unknownRefs++
                                continue
                        }
                        inserted, created, _ := series.insert(sample.T, sample.V, h.chunkDiskMapper, oooCapMax)
                        if created {
                                h.metrics.chunksCreated.Inc()
                                h.metrics.chunks.Inc()
                        }
                        // Only samples that were actually inserted widen the time range.
                        if !inserted {
                                continue
                        }
                        if sample.T < mint {
                                mint = sample.T
                        }
                        if sample.T > maxt {
                                maxt = sample.T
                        }
                }
                // Hand the buffer back for reuse; drop it if the channel is full.
                select {
                case wp.output <- item.samples:
                default:
                }
        }

        h.updateMinOOOMaxOOOTime(mint, maxt)

        return unknownRefs
}

// Record type markers of chunk snapshot records. The series and tombstones
// encoders write their marker as the first byte of the record and the matching
// decoders reject a record that starts with a different value.
// NOTE(review): the exemplars marker is presumably used the same way; its
// encoder is not part of this section.
const (
        chunkSnapshotRecordTypeSeries     uint8 = 1
        chunkSnapshotRecordTypeTombstones uint8 = 2
        chunkSnapshotRecordTypeExemplars  uint8 = 3
)

// chunkSnapshotRecord is the decoded form of a series record from a chunk
// snapshot, as produced by decodeSeriesFromChunkSnapshot.
type chunkSnapshotRecord struct {
        // ref is the series reference stored in the record.
        ref                     chunks.HeadSeriesRef
        // lset is the label set of the series.
        lset                    labels.Labels
        // mc is the head chunk stored in the record; it stays nil when the record has none.
        mc                      *memChunk
        // lastValue is decoded for XOR-encoded chunks.
        lastValue               float64
        // lastHistogramValue is decoded for histogram-encoded chunks.
        lastHistogramValue      *histogram.Histogram
        // lastFloatHistogramValue is decoded for any other chunk encoding (float histograms).
        lastFloatHistogramValue *histogram.FloatHistogram
}

// encodeToSnapshotRecord appends the chunk snapshot record of the series to b
// and returns the resulting bytes.
//
// The record holds the series record type, the series ref, the labels, a
// zeroed legacy field, a flag telling whether a head chunk follows and, if it
// does, the head chunk's time range, encoding and bytes followed by the last
// value in the form matching the chunk encoding. The head chunk is read while
// holding the series lock.
func (s *memSeries) encodeToSnapshotRecord(b []byte) []byte {
        out := encoding.Encbuf{B: b}

        out.PutByte(chunkSnapshotRecordTypeSeries)
        out.PutBE64(uint64(s.ref))
        record.EncodeLabels(&out, s.lset)
        out.PutBE64int64(0) // Backwards-compatibility; was chunkRange but now unused.

        s.Lock()
        if head := s.headChunks; head != nil {
                encType := head.chunk.Encoding()
                out.PutUvarint(1)
                out.PutBE64int64(head.minTime)
                out.PutBE64int64(head.maxTime)
                out.PutByte(byte(encType))
                out.PutUvarintBytes(head.chunk.Bytes())

                switch encType {
                case chunkenc.EncXOR:
                        // Backwards compatibility for old sampleBuf which had last 4 samples:
                        // three zeroed samples followed by one carrying the last value.
                        const zeroedSamples = 3
                        for n := 0; n < zeroedSamples; n++ {
                                out.PutBE64int64(0)
                                out.PutBEFloat64(0)
                        }
                        out.PutBE64int64(0)
                        out.PutBEFloat64(s.lastValue)
                case chunkenc.EncHistogram:
                        record.EncodeHistogram(&out, s.lastHistogramValue)
                default: // chunkenc.FloatHistogram.
                        record.EncodeFloatHistogram(&out, s.lastFloatHistogramValue)
                }
        } else {
                // No head chunk present.
                out.PutUvarint(0)
        }
        s.Unlock()

        return out.Get()
}

// decodeSeriesFromChunkSnapshot decodes a series record of a chunk snapshot
// from b, using d to decode the labels.
//
// The returned record always carries the ref and labels; mc and the matching
// last-value field are only set when the record contains a head chunk. The
// chunk bytes are copied, so the result does not alias b.
//
// An error is returned when the record type is wrong, when the buffer cannot
// be decoded (for example because it is truncated), when the chunk cannot be
// built from its bytes, or when bytes are left over after a record that
// contains a head chunk.
func decodeSeriesFromChunkSnapshot(d *record.Decoder, b []byte) (csr chunkSnapshotRecord, err error) {
        dec := encoding.Decbuf{B: b}

        if flag := dec.Byte(); flag != chunkSnapshotRecordTypeSeries {
                return csr, fmt.Errorf("invalid record type %x", flag)
        }

        csr.ref = chunks.HeadSeriesRef(dec.Be64())
        // The label set written to the disk is already sorted.
        // TODO: figure out why DecodeLabels calls Sort(), and perhaps remove it.
        csr.lset = d.DecodeLabels(&dec)

        _ = dec.Be64int64() // Was chunkRange but now unused.
        hasHeadChunk := dec.Uvarint() != 0
        // Check the decoding state before trusting the flag: a failed read yields
        // zero, which would otherwise be mistaken for "no head chunk" and hide the
        // error from the caller.
        if err = dec.Err(); err != nil {
                return csr, err
        }
        if !hasHeadChunk {
                return csr, nil
        }

        csr.mc = &memChunk{}
        csr.mc.minTime = dec.Be64int64()
        csr.mc.maxTime = dec.Be64int64()
        enc := chunkenc.Encoding(dec.Byte())

        // The underlying bytes gets re-used later, so make a copy.
        chunkBytes := dec.UvarintBytes()
        // Do not build a chunk from values that were not actually decoded.
        if err = dec.Err(); err != nil {
                return csr, err
        }
        chunkBytesCopy := make([]byte, len(chunkBytes))
        copy(chunkBytesCopy, chunkBytes)

        chk, err := chunkenc.FromData(enc, chunkBytesCopy)
        if err != nil {
                return csr, fmt.Errorf("chunk from data: %w", err)
        }
        csr.mc.chunk = chk

        switch enc {
        case chunkenc.EncXOR:
                // Backwards-compatibility for old sampleBuf which had last 4 samples.
                for i := 0; i < 3; i++ {
                        _ = dec.Be64int64()
                        _ = dec.Be64Float64()
                }
                _ = dec.Be64int64()
                csr.lastValue = dec.Be64Float64()
        case chunkenc.EncHistogram:
                csr.lastHistogramValue = &histogram.Histogram{}
                record.DecodeHistogram(&dec, csr.lastHistogramValue)
        default: // chunkenc.FloatHistogram.
                csr.lastFloatHistogramValue = &histogram.FloatHistogram{}
                record.DecodeFloatHistogram(&dec, csr.lastFloatHistogramValue)
        }

        // Report a decoding failure as is; only a cleanly decoded record can be
        // meaningfully checked for trailing bytes.
        if err = dec.Err(); err != nil {
                return csr, err
        }
        if len(dec.B) > 0 {
                return csr, fmt.Errorf("unexpected %d bytes left in entry", len(dec.B))
        }

        return csr, nil
}

// encodeTombstonesToSnapshotRecord builds the chunk snapshot record for the
// tombstones read from tr: the tombstones record type byte followed by the
// length-prefixed encoded tombstones.
func encodeTombstonesToSnapshotRecord(tr tombstones.Reader) ([]byte, error) {
        encoded, err := tombstones.Encode(tr)
        if err != nil {
                return nil, fmt.Errorf("encode tombstones: %w", err)
        }

        var rec encoding.Encbuf
        rec.PutByte(chunkSnapshotRecordTypeTombstones)
        rec.PutUvarintBytes(encoded)

        return rec.Get(), nil
}

// decodeTombstonesSnapshotRecord decodes a tombstones record of a chunk
// snapshot from b. It fails when the record does not start with the
// tombstones record type or when the payload cannot be decoded.
func decodeTombstonesSnapshotRecord(b []byte) (tombstones.Reader, error) {
        dec := encoding.Decbuf{B: b}

        flag := dec.Byte()
        if flag != chunkSnapshotRecordTypeTombstones {
                return nil, fmt.Errorf("invalid record type %x", flag)
        }

        payload := dec.UvarintBytes()
        tr, err := tombstones.Decode(payload)
        if err == nil {
                return tr, nil
        }
        return tr, fmt.Errorf("decode tombstones: %w", err)
}

// chunkSnapshotPrefix is the leading part of the "chunk_snapshot.N.M" directory
// name that the ChunkSnapshot documentation describes.
const chunkSnapshotPrefix = "chunk_snapshot."

// ChunkSnapshot creates a snapshot of all the series and tombstones in the head.
// It deletes the old chunk snapshots if the chunk snapshot creation is successful.
//
// The chunk snapshot is stored in a directory named chunk_snapshot.N.M and is written
// using the WAL package. N is the last WAL segment present during snapshotting and
// M is the offset in segment N up to which data was written.
//
// The snapshot first contains all series (each in individual records and not sorted), followed by
// tombstones (a single record), and finally exemplars (>= 1 record). Exemplars are in the order they
// were written to the circular buffer.
func (h *Head) ChunkSnapshot() (*ChunkSnapshotStats, error) {
        if h.wal == nil {
                // If we are not storing any WAL, does not make sense to take a snapshot too.
                level.Warn(h.logger).Log("msg", "skipping chunk snapshotting as WAL is disabled")
                return &ChunkSnapshotStats{}, nil
        }
        h.chunkSnapshotMtx.Lock()
        defer h.chunkSnapshotMtx.Unlock()

        stats := &ChunkSnapshotStats{}

        wlast, woffset, err := h.wal.LastSegmentAndOffset()
        if err != nil && !errors.Is(err, record.ErrNotFound) {
                return stats, fmt.Errorf("get last wal segment and offset: %w", err)
        }

        _, cslast, csoffset, err := LastChunkSnapshot(h.opts.ChunkDirRoot)
        if err != nil && !errors.Is(err, record.ErrNotFound) {
                return stats, fmt.Errorf("find last chunk snapshot: %w", err)
        }

        if wlast == cslast && woffset == csoffset {
                // Nothing has been written to the WAL/Head since the last snapshot.
                return stats, nil
        }

        snapshotName := chunkSnapshotDir(wlast, woffset)

        cpdir := filepath.Join(h.opts.ChunkDirRoot, snapshotName)
        cpdirtmp := cpdir + ".tmp"
        stats.Dir = cpdir

        if err := os.MkdirAll(cpdirtmp, 0o777); err != nil {
                return stats, fmt.Errorf("create chunk snapshot dir: %w", err)
        }
        cp, err := wlog.New(nil, nil, cpdirtmp, h.wal.CompressionType())
        if err != nil {
                return stats, fmt.Errorf("open chunk snapshot: %w", err)
        }

        // Ensures that an early return caused by an error doesn't leave any tmp files.
        defer func() {
                cp.Close()
                os.RemoveAll(cpdirtmp)
        }()

        var (
                buf  []byte
                recs [][]byte
        )
        // Add all series to the snapshot.
        stripeSize := h.series.size
        for i := 0; i < stripeSize; i++ {
                h.series.locks[i].RLock()

                for _, s := range h.series.series[i] {
                        start := len(buf)
                        buf = s.encodeToSnapshotRecord(buf)
                        if len(buf[start:]) == 0 {
                                continue // All contents discarded.
                        }
                        recs = append(recs, buf[start:])
                        // Flush records in 10 MB increments.
                        if len(buf) > 10*1024*1024 {
                                if err := cp.Log(recs...); err != nil {
                                        h.series.locks[i].RUnlock()
                                        return stats, fmt.Errorf("flush records: %w", err)
                                }
                                buf, recs = buf[:0], recs[:0]
                        }
                }
                stats.TotalSeries += len(h.series.series[i])

                h.series.locks[i].RUnlock()
        }

        // Add tombstones to the snapshot.
        tombstonesReader, err := h.Tombstones()
        if err != nil {
                return stats, fmt.Errorf("get tombstones: %w", err)
        }
        rec, err := encodeTombstonesToSnapshotRecord(tombstonesReader)
        if err != nil {
                return stats, fmt.Errorf("encode tombstones: %w", err)
        }
        recs = append(recs, rec)
        // Flush remaining series records and tombstones.
        if err := cp.Log(recs...); err != nil {
                return stats, fmt.Errorf("flush records: %w", err)
        }
        buf = buf[:0]

        // Add exemplars in the snapshot.
        // We log in batches, with each record having upto 10000 exemplars.
        // Assuming 100 bytes (overestimate) per exemplar, that's ~1MB.
        maxExemplarsPerRecord := 10000
        batch := make([]record.RefExemplar, 0, maxExemplarsPerRecord)
        enc := record.Encoder{}
        flushExemplars := func() error {
                if len(batch) == 0 {
                        return nil
                }
                buf = buf[:0]
                encbuf := encoding.Encbuf{B: buf}
                encbuf.PutByte(chunkSnapshotRecordTypeExemplars)
                enc.EncodeExemplarsIntoBuffer(batch, &encbuf)
                if err := cp.Log(encbuf.Get()); err != nil {
                        return fmt.Errorf("log exemplars: %w", err)
                }
                buf, batch = buf[:0], batch[:0]
                return nil
        }
        err = h.exemplars.IterateExemplars(func(seriesLabels labels.Labels, e exemplar.Exemplar) error {
                if len(batch) >= maxExemplarsPerRecord {
                        if err := flushExemplars(); err != nil {
                                return fmt.Errorf("flush exemplars: %w", err)
                        }
                }

                ms := h.series.getByHash(seriesLabels.Hash(), seriesLabels)
                if ms == nil {
                        // It is possible that exemplar refers to some old series. We discard such exemplars.
                        return nil
                }
                batch = append(batch, record.RefExemplar{
                        Ref:    ms.ref,
                        T:      e.Ts,
                        V:      e.Value,
                        Labels: e.Labels,
                })
                return nil
        })
        if err != nil {
                return stats, fmt.Errorf("iterate exemplars: %w", err)
        }

        // Flush remaining exemplars.
        if err := flushExemplars(); err != nil {
                return stats, fmt.Errorf("flush exemplars at the end: %w", err)
        }

        if err := cp.Close(); err != nil {
                return stats, fmt.Errorf("close chunk snapshot: %w", err)
        }
        if err := fileutil.Replace(cpdirtmp, cpdir); err != nil {
                return stats, fmt.Errorf("rename chunk snapshot directory: %w", err)
        }

        if err := DeleteChunkSnapshots(h.opts.ChunkDirRoot, wlast, woffset); err != nil {
                // Leftover old chunk snapshots do not cause problems down the line beyond
                // occupying disk space.
                // They will just be ignored since a higher chunk snapshot exists.
                level.Error(h.logger).Log("msg", "delete old chunk snapshots", "err", err)
        }
        return stats, nil
}

// chunkSnapshotDir returns the directory name of the chunk snapshot taken at WAL
// segment wlast and offset woffset. The segment is zero-padded to 6 digits and the
// offset to 10 digits.
func chunkSnapshotDir(wlast, woffset int) string {
        suffix := fmt.Sprintf("%06d.%010d", wlast, woffset)
        return chunkSnapshotPrefix + suffix
}

// performChunkSnapshot takes a chunk snapshot via ChunkSnapshot, logging when it
// starts and, on success, how long it took together with the series count and
// snapshot directory. A failure is returned wrapped and is not logged here.
func (h *Head) performChunkSnapshot() error {
        level.Info(h.logger).Log("msg", "creating chunk snapshot")

        begin := time.Now()
        stats, err := h.ChunkSnapshot()
        took := time.Since(begin)
        if err != nil {
                return fmt.Errorf("chunk snapshot: %w", err)
        }

        level.Info(h.logger).Log("msg", "chunk snapshot complete", "duration", took.String(), "num_series", stats.TotalSeries, "dir", stats.Dir)
        return nil
}

// ChunkSnapshotStats holds stats about a created chunk snapshot.
type ChunkSnapshotStats struct {
        // TotalSeries is the number of in-memory series visited while writing the snapshot.
        TotalSeries int
        // Dir is the final (non-temporary) path of the snapshot directory.
        Dir         string
}

// LastChunkSnapshot returns the path, WAL segment index and offset of the most recent
// chunk snapshot found in dir. Snapshots are ordered by index first and offset second.
// Entries carrying the chunk snapshot prefix must be directories; names that do not
// parse as "<index>.<offset>" after the prefix are ignored.
// If dir does not contain any chunk snapshots, record.ErrNotFound is returned.
func LastChunkSnapshot(dir string) (string, int, int, error) {
        entries, err := os.ReadDir(dir)
        if err != nil {
                return "", 0, 0, err
        }

        bestPath := ""
        bestIdx, bestOffset := -1, -1
        for _, entry := range entries {
                name := entry.Name()
                if !strings.HasPrefix(name, chunkSnapshotPrefix) {
                        continue
                }
                if !entry.IsDir() {
                        return "", 0, 0, fmt.Errorf("chunk snapshot %s is not a directory", name)
                }

                // Anything that is not exactly "<index>.<offset>" is not a finished
                // snapshot in the expected format and is skipped.
                parts := strings.Split(name[len(chunkSnapshotPrefix):], ".")
                if len(parts) != 2 {
                        continue
                }
                idx, err := strconv.Atoi(parts[0])
                if err != nil {
                        continue
                }
                offset, err := strconv.Atoi(parts[1])
                if err != nil {
                        continue
                }

                newer := idx > bestIdx || (idx == bestIdx && offset > bestOffset)
                if !newer {
                        continue
                }
                bestIdx, bestOffset = idx, offset
                bestPath = filepath.Join(dir, name)
        }

        if bestPath == "" {
                return "", 0, 0, record.ErrNotFound
        }
        return bestPath, bestIdx, bestOffset, nil
}

// DeleteChunkSnapshots deletes all chunk snapshots in dir that are older than the
// snapshot identified by maxIndex and maxOffset, i.e. those with a lower index, or
// the same index and a lower offset. Entries whose names do not parse as
// "<index>.<offset>" after the chunk snapshot prefix are left untouched.
// Removal errors are collected and returned together.
func DeleteChunkSnapshots(dir string, maxIndex, maxOffset int) error {
        entries, err := os.ReadDir(dir)
        if err != nil {
                return err
        }

        errs := tsdb_errors.NewMulti()
        for _, entry := range entries {
                name := entry.Name()
                if !strings.HasPrefix(name, chunkSnapshotPrefix) {
                        continue
                }

                parts := strings.Split(name[len(chunkSnapshotPrefix):], ".")
                if len(parts) != 2 {
                        continue
                }
                idx, err := strconv.Atoi(parts[0])
                if err != nil {
                        continue
                }
                offset, err := strconv.Atoi(parts[1])
                if err != nil {
                        continue
                }

                // Keep the snapshot at (maxIndex, maxOffset) and anything newer.
                if idx > maxIndex || (idx == maxIndex && offset >= maxOffset) {
                        continue
                }
                if err := os.RemoveAll(filepath.Join(dir, name)); err != nil {
                        errs.Add(err)
                }
        }
        return errs.Err()
}

// loadChunkSnapshot replays the chunk snapshot and restores the Head state from it. If there was any error returned,
// it is the responsibility of the caller to clear the contents of the Head.
//
// It returns the WAL segment index and offset of the loaded snapshot together with a map of all
// series restored from it, keyed by series reference. When no snapshot exists, the map and the
// error are both nil. When replaying fails, the index and offset are -1.
//
// Series records are decoded on the calling goroutine and applied to the Head by
// WALReplayConcurrency worker goroutines. Tombstone and exemplar records are applied
// on the calling goroutine.
func (h *Head) loadChunkSnapshot() (int, int, map[chunks.HeadSeriesRef]*memSeries, error) {
        dir, snapIdx, snapOffset, err := LastChunkSnapshot(h.opts.ChunkDirRoot)
        if err != nil {
                if errors.Is(err, record.ErrNotFound) {
                        return snapIdx, snapOffset, nil, nil
                }
                return snapIdx, snapOffset, nil, fmt.Errorf("find last chunk snapshot: %w", err)
        }

        start := time.Now()
        sr, err := wlog.NewSegmentsReader(dir)
        if err != nil {
                return snapIdx, snapOffset, nil, fmt.Errorf("open chunk snapshot: %w", err)
        }
        defer func() {
                if err := sr.Close(); err != nil {
                        level.Warn(h.logger).Log("msg", "error while closing the wal segments reader", "err", err)
                }
        }()

        var (
                numSeries        = 0
                unknownRefs      = int64(0)
                concurrency      = h.opts.WALReplayConcurrency
                wg               sync.WaitGroup
                recordChan       = make(chan chunkSnapshotRecord, 5*concurrency)
                shardedRefSeries = make([]map[chunks.HeadSeriesRef]*memSeries, concurrency)
                errChan          = make(chan error, concurrency)
                refSeries        map[chunks.HeadSeriesRef]*memSeries
                exemplarBuf      []record.RefExemplar
                syms             = labels.NewSymbolTable() // New table for the whole snapshot.
                dec              = record.NewDecoder(syms)

                // workersStopped records that recordChan has been closed and all workers
                // have finished. Closing a channel twice panics, so this must not be
                // inferred from refSeries, which stays empty when no series were restored.
                workersStopped bool
        )

        // stopWorkers closes recordChan and waits for the workers, at most once.
        stopWorkers := func() {
                if workersStopped {
                        return
                }
                workersStopped = true
                close(recordChan)
                wg.Wait()
        }

        wg.Add(concurrency)
        for i := 0; i < concurrency; i++ {
                go func(idx int, rc <-chan chunkSnapshotRecord) {
                        defer wg.Done()
                        defer func() {
                                // If there was an error, drain the channel
                                // to unblock the main thread.
                                for range rc {
                                }
                        }()

                        // Each worker fills its own map; the maps are merged once all workers are done.
                        shardedRefSeries[idx] = make(map[chunks.HeadSeriesRef]*memSeries)
                        localRefSeries := shardedRefSeries[idx]

                        for csr := range rc {
                                series, _, err := h.getOrCreateWithID(csr.ref, csr.lset.Hash(), csr.lset)
                                if err != nil {
                                        errChan <- err
                                        return
                                }
                                localRefSeries[csr.ref] = series
                                // Raise lastSeriesID to at least this series' ref, retrying the
                                // compare-and-swap if another worker updated it concurrently.
                                for {
                                        seriesID := uint64(series.ref)
                                        lastSeriesID := h.lastSeriesID.Load()
                                        if lastSeriesID >= seriesID || h.lastSeriesID.CompareAndSwap(lastSeriesID, seriesID) {
                                                break
                                        }
                                }

                                if csr.mc == nil {
                                        continue
                                }
                                series.nextAt = csr.mc.maxTime // This will create a new chunk on append.
                                series.headChunks = csr.mc
                                series.lastValue = csr.lastValue
                                series.lastHistogramValue = csr.lastHistogramValue
                                series.lastFloatHistogramValue = csr.lastFloatHistogramValue

                                app, err := series.headChunks.chunk.Appender()
                                if err != nil {
                                        errChan <- err
                                        return
                                }
                                series.app = app

                                h.updateMinMaxTime(csr.mc.minTime, csr.mc.maxTime)
                        }
                }(i, recordChan)
        }

        r := wlog.NewReader(sr)
        var loopErr error
Outer:
        for r.Next() {
                // Stop early if a worker failed. The error is put back so that it is
                // reported with the other worker errors after the loop.
                select {
                case err := <-errChan:
                        errChan <- err
                        break Outer
                default:
                }

                rec := r.Record()
                switch rec[0] {
                case chunkSnapshotRecordTypeSeries:
                        if workersStopped {
                                // Series records are expected before any exemplars record. Sending
                                // on the already closed recordChan would panic.
                                loopErr = fmt.Errorf("series record found after exemplars record")
                                break Outer
                        }
                        numSeries++
                        csr, err := decodeSeriesFromChunkSnapshot(&dec, rec)
                        if err != nil {
                                loopErr = fmt.Errorf("decode series record: %w", err)
                                break Outer
                        }
                        recordChan <- csr

                case chunkSnapshotRecordTypeTombstones:
                        tr, err := decodeTombstonesSnapshotRecord(rec)
                        if err != nil {
                                loopErr = fmt.Errorf("decode tombstones: %w", err)
                                break Outer
                        }

                        if err = tr.Iter(func(ref storage.SeriesRef, ivs tombstones.Intervals) error {
                                h.tombstones.AddInterval(ref, ivs...)
                                return nil
                        }); err != nil {
                                loopErr = fmt.Errorf("iterate tombstones: %w", err)
                                break Outer
                        }

                case chunkSnapshotRecordTypeExemplars:
                        // Exemplars are at the end of snapshot. So all series are loaded at this point.
                        if !workersStopped {
                                stopWorkers()

                                refSeries = make(map[chunks.HeadSeriesRef]*memSeries, numSeries)
                                for _, shard := range shardedRefSeries {
                                        for k, v := range shard {
                                                refSeries[k] = v
                                        }
                                }
                        }

                        if !h.opts.EnableExemplarStorage || h.opts.MaxExemplars.Load() <= 0 {
                                // Exemplar storage is disabled.
                                continue Outer
                        }

                        decbuf := encoding.Decbuf{B: rec[1:]}

                        exemplarBuf = exemplarBuf[:0]
                        exemplarBuf, err = dec.ExemplarsFromBuffer(&decbuf, exemplarBuf)
                        if err != nil {
                                loopErr = fmt.Errorf("exemplars from buffer: %w", err)
                                break Outer
                        }

                        for _, e := range exemplarBuf {
                                ms, ok := refSeries[e.Ref]
                                if !ok {
                                        unknownRefs++
                                        continue
                                }

                                if err := h.exemplars.AddExemplar(ms.lset, exemplar.Exemplar{
                                        Labels: e.Labels,
                                        Value:  e.V,
                                        Ts:     e.T,
                                }); err != nil {
                                        loopErr = fmt.Errorf("add exemplar: %w", err)
                                        break Outer
                                }
                        }

                default:
                        // This is a record type we don't understand. It is either an old format from earlier versions,
                        // or a new format and the code was rolled back to old version.
                        loopErr = fmt.Errorf("unsupported snapshot record type 0b%b", rec[0])
                        break Outer
                }
        }
        // No-op if an exemplars record already stopped the workers.
        stopWorkers()

        // All workers are done, so no further errors can be sent.
        close(errChan)
        merr := tsdb_errors.NewMulti()
        if loopErr != nil {
                merr.Add(fmt.Errorf("decode loop: %w", loopErr))
        }
        for err := range errChan {
                merr.Add(fmt.Errorf("record processing: %w", err))
        }
        if err := merr.Err(); err != nil {
                return -1, -1, nil, err
        }

        if err := r.Err(); err != nil {
                return -1, -1, nil, fmt.Errorf("read records: %w", err)
        }

        if len(refSeries) == 0 {
                // We had no exemplar record, so we have to build the map here.
                refSeries = make(map[chunks.HeadSeriesRef]*memSeries, numSeries)
                for _, shard := range shardedRefSeries {
                        for k, v := range shard {
                                refSeries[k] = v
                        }
                }
        }

        elapsed := time.Since(start)
        level.Info(h.logger).Log("msg", "chunk snapshot loaded", "dir", dir, "num_series", numSeries, "duration", elapsed.String())
        if unknownRefs > 0 {
                level.Warn(h.logger).Log("msg", "unknown series references during chunk snapshot replay", "count", unknownRefs)
        }

        return snapIdx, snapOffset, refSeries, nil
}

// Copyright 2017 The Prometheus Authors
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package index

import (
        "bufio"
        "bytes"
        "context"
        "encoding/binary"
        "fmt"
        "hash"
        "hash/crc32"
        "io"
        "math"
        "os"
        "path/filepath"
        "slices"
        "sort"
        "unsafe"

        "github.com/prometheus/prometheus/model/labels"
        "github.com/prometheus/prometheus/storage"
        "github.com/prometheus/prometheus/tsdb/chunks"
        "github.com/prometheus/prometheus/tsdb/encoding"
        tsdb_errors "github.com/prometheus/prometheus/tsdb/errors"
        "github.com/prometheus/prometheus/tsdb/fileutil"
)

const (
        // MagicIndex is the 4-byte magic number written at the head of an index file.
        MagicIndex = 0xBAAAD700
        // HeaderLen represents number of bytes reserved of index for header.
        // NOTE(review): presumably the 4-byte magic number plus the 1-byte format
        // version written by writeMeta — confirm.
        HeaderLen = 5

        // FormatV1 represents version 1 of the index format.
        FormatV1 = 1
        // FormatV2 represents version 2 of the index format.
        FormatV2 = 2

        indexFilename = "index"

        // seriesByteAlign is the alignment, in bytes, that AddSeries pads to
        // before writing each series entry.
        seriesByteAlign = 16

        // checkContextEveryNIterations is used in some tight loops to check if the context is done.
        checkContextEveryNIterations = 128
)

// indexWriterSeries pairs the label set of a series with the metadata of its chunks.
type indexWriterSeries struct {
        labels labels.Labels
        chunks []chunks.Meta // series file offset of chunks
}

// indexWriterSeriesSlice is a list of series that provides the Len, Swap and Less
// methods of sort.Interface, ordering the series by their label sets.
type indexWriterSeriesSlice []*indexWriterSeries

// Len returns the number of series in the slice.
func (s indexWriterSeriesSlice) Len() int {
        return len(s)
}

// Swap exchanges the series at positions i and j.
func (s indexWriterSeriesSlice) Swap(i, j int) {
        s[j], s[i] = s[i], s[j]
}

// Less reports whether the label set of the series at position i sorts before
// the label set of the series at position j.
func (s indexWriterSeriesSlice) Less(i, j int) bool {
        a, b := s[i], s[j]
        return labels.Compare(a.labels, b.labels) < 0
}

// indexWriterStage identifies the section of the index a Writer is currently
// producing. Stages are ordered; ensureStage only ever moves forward through them.
type indexWriterStage uint8

const (
        idxStageNone indexWriterStage = iota
        idxStageSymbols
        idxStageSeries
        idxStageDone
)

// String returns the lower-case name of the stage, or "<unknown>" for a value
// that is not one of the declared stages.
func (s indexWriterStage) String() string {
        names := [...]string{
                idxStageNone:    "none",
                idxStageSymbols: "symbols",
                idxStageSeries:  "series",
                idxStageDone:    "done",
        }
        if int(s) >= len(names) {
                return "<unknown>"
        }
        return names[s]
}

// castagnoliTable is the CRC32 table for the Castagnoli polynomial used for
// the checksums in this package.
//
// The table gets initialized with sync.Once but may still cause a race
// with any other use of the crc32 package anywhere. Thus we initialize it
// before.
var castagnoliTable *crc32.Table

// init builds castagnoliTable once at package initialization.
func init() {
        castagnoliTable = crc32.MakeTable(crc32.Castagnoli)
}

// newCRC32 initializes a CRC32 hash with a preconfigured polynomial, so the
// polynomial may be easily changed in one location at a later time, if necessary.
// The hash is built on the package-level castagnoliTable.
func newCRC32() hash.Hash32 {
        return crc32.New(castagnoliTable)
}

// symbolCacheEntry is the value type of Writer.symbolCache, which is keyed by string.
// NOTE(review): presumably index is the symbol index of a label name, and
// lastValue/lastValueIndex cache the most recently seen value for that name
// together with its symbol index — confirm against the code that fills the cache.
type symbolCacheEntry struct {
        index          uint32
        lastValueIndex uint32
        lastValue      string
}

// PostingsEncoder encodes one postings list, given as a slice of uint32 values,
// into the supplied buffer. A Writer uses it to encode each postings list.
type PostingsEncoder func(*encoding.Encbuf, []uint32) error

// Writer implements the IndexWriter interface for the standard
// serialization format.
//
// A Writer moves through the stages none, symbols, series and done; see
// ensureStage for the work performed on each transition.
type Writer struct {
        // ctx is checked for cancellation on every stage check.
        ctx context.Context

        // For the main index file.
        f *FileWriter

        // Temporary file for postings.
        fP *FileWriter
        // Temporary file for posting offsets table.
        fPO   *FileWriter
        cntPO uint64

        // toc records the offset in the main index file at which each section starts.
        toc           TOC
        stage         indexWriterStage
        postingsStart uint64 // Due to padding, can differ from TOC entry.

        // Reusable memory.
        buf1 encoding.Encbuf
        buf2 encoding.Encbuf

        numSymbols  int
        symbols     *Symbols
        symbolFile  *fileutil.MmapFile
        lastSymbol  string
        symbolCache map[string]symbolCacheEntry

        labelIndexes []labelIndexHashEntry // Label index offsets.
        labelNames   map[string]uint64     // Label names, and their usage.

        // Hold last series to validate that clients insert new series in order.
        lastSeries    labels.Labels
        lastSeriesRef storage.SeriesRef

        // Hold last added chunk reference to make sure that chunks are ordered properly.
        lastChunkRef chunks.ChunkRef

        crc32 hash.Hash

        Version int

        // postingsEncoder is the encoder supplied to NewWriterWithEncoder.
        postingsEncoder PostingsEncoder
}

// TOC represents the index Table Of Contents that states where each section of the index starts.
// Each field is the offset in the index file at which the named section begins.
type TOC struct {
        Symbols           uint64
        Series            uint64
        LabelIndices      uint64
        LabelIndicesTable uint64
        Postings          uint64
        PostingsTable     uint64
}

// NewTOCFromByteSlice returns a parsed TOC from the given index byte slice.
//
// The TOC occupies the last indexTOCLen bytes of bs: six big-endian 64-bit
// section offsets followed by a 4-byte big-endian CRC32 (Castagnoli) of those
// offsets. encoding.ErrInvalidSize is returned if bs is shorter than the TOC,
// and a wrapped encoding.ErrInvalidChecksum if the checksum does not match.
func NewTOCFromByteSlice(bs ByteSlice) (*TOC, error) {
        total := bs.Len()
        if total < indexTOCLen {
                return nil, encoding.ErrInvalidSize
        }

        raw := bs.Range(total-indexTOCLen, total)
        crcOff := len(raw) - 4
        payload, sum := raw[:crcOff], raw[crcOff:]

        d := encoding.Decbuf{B: payload}
        if d.Crc32(castagnoliTable) != binary.BigEndian.Uint32(sum) {
                return nil, fmt.Errorf("read TOC: %w", encoding.ErrInvalidChecksum)
        }

        // The offsets are stored in exactly this order.
        toc := new(TOC)
        toc.Symbols = d.Be64()
        toc.Series = d.Be64()
        toc.LabelIndices = d.Be64()
        toc.LabelIndicesTable = d.Be64()
        toc.Postings = d.Be64()
        toc.PostingsTable = d.Be64()
        return toc, d.Err()
}

// NewWriterWithEncoder returns a new Writer to the given filename. It serializes data in format version 2.
// It uses the given encoder to encode each postings list.
//
// Anything already present at fn is removed first. Besides the main index file,
// two temporary files are created next to it, named fn+"_tmp_p" and fn+"_tmp_po".
// If an error occurs after some of these files have been opened, their
// descriptors are closed before returning; the files themselves are left on disk.
func NewWriterWithEncoder(ctx context.Context, fn string, encoder PostingsEncoder) (*Writer, error) {
        dir := filepath.Dir(fn)

        df, err := fileutil.OpenDir(dir)
        if err != nil {
                return nil, err
        }
        defer df.Close() // Close for platform windows.

        if err := os.RemoveAll(fn); err != nil {
                return nil, fmt.Errorf("remove any existing index at path: %w", err)
        }

        // Files opened so far. On a later failure the Writer is never handed to the
        // caller, so nobody else could close them.
        var opened []*os.File
        closeOpened := func() {
                for _, of := range opened {
                        // The writer is being abandoned: close the descriptor directly
                        // rather than flushing and syncing buffered data first.
                        _ = of.Close()
                }
        }

        // Main index file we are building.
        f, err := NewFileWriter(fn)
        if err != nil {
                return nil, err
        }
        opened = append(opened, f.f)
        // Temporary file for postings.
        fP, err := NewFileWriter(fn + "_tmp_p")
        if err != nil {
                closeOpened()
                return nil, err
        }
        opened = append(opened, fP.f)
        // Temporary file for posting offset table.
        fPO, err := NewFileWriter(fn + "_tmp_po")
        if err != nil {
                closeOpened()
                return nil, err
        }
        opened = append(opened, fPO.f)
        // Sync the directory so that the newly created files are persisted in it.
        if err := df.Sync(); err != nil {
                closeOpened()
                return nil, fmt.Errorf("sync dir: %w", err)
        }

        iw := &Writer{
                ctx:   ctx,
                f:     f,
                fP:    fP,
                fPO:   fPO,
                stage: idxStageNone,

                // Reusable memory.
                buf1: encoding.Encbuf{B: make([]byte, 0, 1<<22)},
                buf2: encoding.Encbuf{B: make([]byte, 0, 1<<22)},

                symbolCache:     make(map[string]symbolCacheEntry, 1<<8),
                labelNames:      make(map[string]uint64, 1<<8),
                crc32:           newCRC32(),
                postingsEncoder: encoder,
        }
        if err := iw.writeMeta(); err != nil {
                closeOpened()
                return nil, err
        }
        return iw, nil
}

// NewWriter creates a new index writer for the file fn using the default
// postings encoder, EncodePostingsRaw. See NewWriterWithEncoder.
func NewWriter(ctx context.Context, fn string) (*Writer, error) {
        return NewWriterWithEncoder(ctx, fn, EncodePostingsRaw)
}

// write appends the given buffers to the main index file.
func (w *Writer) write(bufs ...[]byte) error {
        return w.f.Write(bufs...)
}

// writeAt writes buf into the main index file at offset pos.
func (w *Writer) writeAt(buf []byte, pos uint64) error {
        return w.f.WriteAt(buf, pos)
}

// addPadding pads the main index file with zero bytes up to the next multiple of size.
func (w *Writer) addPadding(size int) error {
        return w.f.AddPadding(size)
}

// FileWriter is a buffered writer to a file that tracks how many bytes have
// been written through it.
type FileWriter struct {
        f    *os.File      // Underlying file.
        fbuf *bufio.Writer // Buffered writer on top of f.
        pos  uint64        // Number of bytes written through Write so far.
        name string        // Path the file was opened with.
}

// NewFileWriter opens the file name for reading and writing, creating it with
// mode 0o666 if it does not exist, and returns a FileWriter that buffers writes
// to it in a 4 MiB buffer.
func NewFileWriter(name string) (*FileWriter, error) {
        file, err := os.OpenFile(name, os.O_RDWR|os.O_CREATE, 0o666)
        if err != nil {
                return nil, err
        }

        fw := &FileWriter{name: name, f: file}
        fw.fbuf = bufio.NewWriterSize(file, 1<<22)
        return fw, nil
}

// Pos returns the number of bytes written through Write so far. Bytes written
// with WriteAt are not counted.
func (fw *FileWriter) Pos() uint64 {
        return fw.pos
}

// Write appends the given buffers, in order, to the buffered file and advances
// the position by the number of bytes accepted. It stops at the first write
// error, or as soon as the position exceeds the maximum index size.
func (fw *FileWriter) Write(bufs ...[]byte) error {
        // For now the index file must not grow beyond 64GiB. Some of the fixed-sized
        // offset references in v1 are only 4 bytes large.
        // Once we move to compressed/varint representations in those areas, this limitation
        // can be lifted.
        const maxSize = 16 * math.MaxUint32

        for i := range bufs {
                written, err := fw.fbuf.Write(bufs[i])
                fw.pos += uint64(written)
                switch {
                case err != nil:
                        return err
                case fw.pos > maxSize:
                        return fmt.Errorf("%q exceeding max size of 64GiB", fw.name)
                }
        }
        return nil
}

// Flush writes any data held in the write buffer to the underlying file.
func (fw *FileWriter) Flush() error {
        return fw.fbuf.Flush()
}

// WriteAt writes buf to the underlying file at offset pos. Buffered data is
// flushed first, so the direct write is applied after all data appended so far.
// The position reported by Pos is not changed.
func (fw *FileWriter) WriteAt(buf []byte, pos uint64) error {
        err := fw.Flush()
        if err == nil {
                _, err = fw.f.WriteAt(buf, int64(pos))
        }
        return err
}

// AddPadding appends zero bytes until the write position is a multiple of size.
// Nothing is written if the position is already aligned.
// It returns an error if size is not positive, or if writing the padding fails.
func (fw *FileWriter) AddPadding(size int) error {
        // A zero size would divide by zero below, and a negative one would wrap
        // around to a huge unsigned alignment.
        if size <= 0 {
                return fmt.Errorf("add padding: invalid alignment size %d", size)
        }

        rem := fw.pos % uint64(size)
        if rem == 0 {
                return nil
        }

        if err := fw.Write(make([]byte, uint64(size)-rem)); err != nil {
                return fmt.Errorf("add padding: %w", err)
        }
        return nil
}

// Close flushes buffered data, syncs the file and closes it.
//
// The underlying file is closed even when flushing or syncing fails, so that a
// failed Close does not leak the file descriptor. The first error encountered
// is returned; the sync is skipped if the flush failed.
func (fw *FileWriter) Close() error {
        err := fw.Flush()
        if err == nil {
                err = fw.f.Sync()
        }
        if closeErr := fw.f.Close(); err == nil {
                err = closeErr
        }
        return err
}

// Remove deletes the file from disk by the name it was opened with. It does
// not close the file.
func (fw *FileWriter) Remove() error {
        return os.Remove(fw.name)
}

// ensureStage handles transitions between write stages and ensures that IndexWriter
// methods are called in an order valid for the implementation.
//
// It returns the context's error if the Writer's context is done. If the Writer
// is already at stage s, nothing happens. If stages between the current one and s
// were skipped, they are entered first, in order. Requesting a stage earlier than
// the current one is an error. On entering a stage, the start offset of the
// corresponding index sections is recorded in the table of contents and the
// work that opens or finalizes those sections is performed.
func (w *Writer) ensureStage(s indexWriterStage) error {
        // Non-blocking check for cancellation.
        select {
        case <-w.ctx.Done():
                return w.ctx.Err()
        default:
        }

        if w.stage == s {
                return nil
        }
        if w.stage < s-1 {
                // A stage has been skipped.
                if err := w.ensureStage(s - 1); err != nil {
                        return err
                }
        }
        if w.stage > s {
                return fmt.Errorf("invalid stage %q, currently at %q", s, w.stage)
        }

        // Mark start of sections in table of contents.
        switch s {
        case idxStageSymbols:
                w.toc.Symbols = w.f.pos
                if err := w.startSymbols(); err != nil {
                        return err
                }
        case idxStageSeries:
                // The symbols section has to be completed before the series section starts.
                if err := w.finishSymbols(); err != nil {
                        return err
                }
                w.toc.Series = w.f.pos

        case idxStageDone:
                w.toc.LabelIndices = w.f.pos
                // LabelIndices generation depends on the posting offset
                // table produced at this stage.
                if err := w.writePostingsToTmpFiles(); err != nil {
                        return err
                }
                if err := w.writeLabelIndices(); err != nil {
                        return err
                }

                w.toc.Postings = w.f.pos
                if err := w.writePostings(); err != nil {
                        return err
                }

                w.toc.LabelIndicesTable = w.f.pos
                if err := w.writeLabelIndexesOffsetTable(); err != nil {
                        return err
                }

                w.toc.PostingsTable = w.f.pos
                if err := w.writePostingsOffsetTable(); err != nil {
                        return err
                }
                // The table of contents is written last, once all section offsets are known.
                if err := w.writeTOC(); err != nil {
                        return err
                }
        }

        w.stage = s
        return nil
}

// writeMeta emits the index file header: the big-endian MagicIndex value
// followed by a single FormatV2 version byte.
func (w *Writer) writeMeta() error {
        w.buf1.Reset()
        w.buf1.PutBE32(MagicIndex)
        w.buf1.PutByte(FormatV2)

        header := w.buf1.Get()
        return w.write(header)
}

// AddSeries adds the series one at a time along with its chunks.
//
// The writer is first advanced to the series stage. The call is rejected if
// lset does not compare strictly greater than the previously added label set,
// if ref is lower than the previously added reference (once at least one
// series has been added), or if the chunks are not sorted by reference with
// strictly increasing, well-formed time ranges.
//
// Every label name and value must already be present in the symbol table,
// otherwise an error is returned.
func (w *Writer) AddSeries(ref storage.SeriesRef, lset labels.Labels, chunks ...chunks.Meta) error {
        if err := w.ensureStage(idxStageSeries); err != nil {
                return err
        }
        // Series must arrive in strictly increasing label-set order.
        if labels.Compare(lset, w.lastSeries) <= 0 {
                return fmt.Errorf("out-of-order series added with label set %q", lset)
        }

        // The reference check is skipped for the very first series, when
        // lastSeries is still empty.
        if ref < w.lastSeriesRef && !w.lastSeries.IsEmpty() {
                return fmt.Errorf("series with reference greater than %d already added", ref)
        }

        // Chunk references must not decrease, including relative to the last
        // chunk of the previously added series. The new high-water mark is only
        // committed to the writer after the series has been written.
        lastChunkRef := w.lastChunkRef
        lastMaxT := int64(0)
        for ix, c := range chunks {
                if c.Ref < lastChunkRef {
                        return fmt.Errorf("unsorted chunk reference: %d, previous: %d", c.Ref, lastChunkRef)
                }
                lastChunkRef = c.Ref

                // The first chunk has no predecessor to compare against.
                if ix > 0 && c.MinTime <= lastMaxT {
                        return fmt.Errorf("chunk minT %d is not higher than previous chunk maxT %d", c.MinTime, lastMaxT)
                }
                if c.MaxTime < c.MinTime {
                        return fmt.Errorf("chunk maxT %d is less than minT %d", c.MaxTime, c.MinTime)
                }
                lastMaxT = c.MaxTime
        }

        // We add padding to 16 bytes to increase the addressable space we get through 4 byte
        // series references.
        if err := w.addPadding(seriesByteAlign); err != nil {
                return fmt.Errorf("failed to write padding bytes: %w", err)
        }

        // Sanity check that the padding left us on an aligned position.
        if w.f.pos%seriesByteAlign != 0 {
                return fmt.Errorf("series write not 16-byte aligned at %d", w.f.pos)
        }

        // buf2 accumulates the series entry body, starting with the label count.
        w.buf2.Reset()
        w.buf2.PutUvarint(lset.Len())

        // NOTE(review): Validate is used here to visit the labels with a callback
        // that can fail; presumably it invokes the callback once per label and
        // stops at the first error — confirm against labels.Labels.
        if err := lset.Validate(func(l labels.Label) error {
                var err error
                // symbolCache remembers, per label name, the name's symbol index and
                // the symbol index of the most recently written value.
                cacheEntry, ok := w.symbolCache[l.Name]
                nameIndex := cacheEntry.index
                if !ok {
                        nameIndex, err = w.symbols.ReverseLookup(l.Name)
                        if err != nil {
                                return fmt.Errorf("symbol entry for %q does not exist, %w", l.Name, err)
                        }
                }
                // Count the uses of each label name; writePostingsToTmpFiles reads
                // these counts when batching label names.
                w.labelNames[l.Name]++
                w.buf2.PutUvarint32(nameIndex)

                valueIndex := cacheEntry.lastValueIndex
                // Only look the value up when it differs from the cached one.
                if !ok || cacheEntry.lastValue != l.Value {
                        valueIndex, err = w.symbols.ReverseLookup(l.Value)
                        if err != nil {
                                return fmt.Errorf("symbol entry for %q does not exist, %w", l.Value, err)
                        }
                        w.symbolCache[l.Name] = symbolCacheEntry{
                                index:          nameIndex,
                                lastValueIndex: valueIndex,
                                lastValue:      l.Value,
                        }
                }
                w.buf2.PutUvarint32(valueIndex)
                return nil
        }); err != nil {
                return err
        }

        w.buf2.PutUvarint(len(chunks))

        if len(chunks) > 0 {
                // The first chunk is stored with its absolute MinTime, its duration
                // (MaxTime - MinTime) and its absolute reference.
                c := chunks[0]
                w.buf2.PutVarint64(c.MinTime)
                w.buf2.PutUvarint64(uint64(c.MaxTime - c.MinTime))
                w.buf2.PutUvarint64(uint64(c.Ref))
                t0 := c.MaxTime
                ref0 := int64(c.Ref)

                // Each following chunk is stored relative to its predecessor: the gap
                // between its MinTime and the previous MaxTime, its duration, and the
                // signed difference between the two references.
                for _, c := range chunks[1:] {
                        w.buf2.PutUvarint64(uint64(c.MinTime - t0))
                        w.buf2.PutUvarint64(uint64(c.MaxTime - c.MinTime))
                        t0 = c.MaxTime

                        w.buf2.PutVarint64(int64(c.Ref) - ref0)
                        ref0 = int64(c.Ref)
                }
        }

        // buf1 holds the length prefix. It is taken before the hash is appended,
        // so the recorded length does not include the hash bytes.
        w.buf1.Reset()
        w.buf1.PutUvarint(w.buf2.Len())

        w.buf2.PutHash(w.crc32)

        if err := w.write(w.buf1.Get(), w.buf2.Get()); err != nil {
                return fmt.Errorf("write series data: %w", err)
        }

        // Remember this series for the ordering checks of the next call.
        w.lastSeries.CopyFrom(lset)
        w.lastSeriesRef = ref
        w.lastChunkRef = lastChunkRef

        return nil
}

// startSymbols reserves the 8-byte header of the symbol table at the current
// position (w.toc.Symbols): 4 bytes for the table length and 4 bytes for the
// symbol count. Both are filled in by finishSymbols once they are known.
func (w *Writer) startSymbols() error {
        // The contents are placeholders only; they get overwritten later.
        placeholder := []byte("alenblen")
        return w.write(placeholder)
}

// AddSymbol appends sym to the symbol table. Symbols must be added in
// strictly increasing order; a symbol that is less than or equal to the
// previously added one is rejected with an error.
func (w *Writer) AddSymbol(sym string) error {
        if err := w.ensureStage(idxStageSymbols); err != nil {
                return err
        }

        // The very first symbol has no predecessor to be compared against.
        outOfOrder := w.numSymbols != 0 && sym <= w.lastSymbol
        if outOfOrder {
                return fmt.Errorf("symbol %q out-of-order", sym)
        }
        w.lastSymbol, w.numSymbols = sym, w.numSymbols+1

        w.buf1.Reset()
        w.buf1.PutUvarintStr(sym)
        encoded := w.buf1.Get()
        return w.write(encoded)
}

// finishSymbols completes the symbol table: it back-fills the length and
// symbol count reserved by startSymbols, appends the table checksum, and
// loads the finished table through a memory mapping of the index file so
// later stages can resolve symbols via w.symbols.
func (w *Writer) finishSymbols() error {
        // Bytes written since the start of the table, excluding the 4-byte length
        // field itself.
        symbolTableSize := w.f.pos - w.toc.Symbols - 4
        // The symbol table's <len> part is 4 bytes. So the total symbol table size must be less than or equal to 2^32-1
        if symbolTableSize > math.MaxUint32 {
                return fmt.Errorf("symbol table size exceeds %d bytes: %d", uint32(math.MaxUint32), symbolTableSize)
        }

        // Write out the length and symbol count.
        w.buf1.Reset()
        w.buf1.PutBE32int(int(symbolTableSize))
        w.buf1.PutBE32int(w.numSymbols)
        if err := w.writeAt(w.buf1.Get(), w.toc.Symbols); err != nil {
                return err
        }

        hashPos := w.f.pos
        // Leave space for the hash. We can only calculate it
        // now that the number of symbols is known, so mmap and do it from there.
        if err := w.write([]byte("hash")); err != nil {
                return err
        }
        // Flush before mapping the file so the mapping sees everything written so far.
        if err := w.f.Flush(); err != nil {
                return err
        }

        sf, err := fileutil.OpenMmapFile(w.f.name)
        if err != nil {
                return err
        }
        // The mapping is kept on the writer because w.symbols reads from it; it
        // is released in Close.
        w.symbolFile = sf
        // The checksum covers everything after the length field up to the hash
        // placeholder, i.e. the symbol count and the symbols themselves.
        hash := crc32.Checksum(w.symbolFile.Bytes()[w.toc.Symbols+4:hashPos], castagnoliTable)
        w.buf1.Reset()
        w.buf1.PutBE32(hash)
        if err := w.writeAt(w.buf1.Get(), hashPos); err != nil {
                return err
        }

        // Load in the symbol table efficiently for the rest of the index writing.
        w.symbols, err = NewSymbols(realByteSlice(w.symbolFile.Bytes()), FormatV2, int(w.toc.Symbols))
        if err != nil {
                return fmt.Errorf("read symbols: %w", err)
        }
        return nil
}

// writeLabelIndices writes one label index section per label name. The names
// and values are recovered by scanning the temporary postings offset table
// (w.fPO), in which the entries of one label name are expected to be
// adjacent.
func (w *Writer) writeLabelIndices() error {
        // Make sure every entry is on disk before the file is memory-mapped.
        if err := w.fPO.Flush(); err != nil {
                return err
        }

        // Find all the label values in the tmp posting offset table.
        f, err := fileutil.OpenMmapFile(w.fPO.name)
        if err != nil {
                return err
        }
        defer f.Close()

        d := encoding.NewDecbufRaw(realByteSlice(f.Bytes()), int(w.fPO.pos))
        cnt := w.cntPO
        // current is the label name whose values are being collected; values
        // holds the symbol references of those values.
        current := []byte{}
        values := []uint32{}
        for d.Err() == nil && cnt > 0 {
                cnt--
                d.Uvarint()                           // Keycount.
                name := d.UvarintBytes()              // Label name.
                value := yoloString(d.UvarintBytes()) // Label value.
                d.Uvarint64()                         // Offset.
                if len(name) == 0 {
                        continue // All index is ignored.
                }

                if !bytes.Equal(name, current) && len(values) > 0 {
                        // We've reached a new label name.
                        if err := w.writeLabelIndex(string(current), values); err != nil {
                                return err
                        }
                        // Reuse the backing array for the next label name.
                        values = values[:0]
                }
                current = name
                sid, err := w.symbols.ReverseLookup(value)
                if err != nil {
                        return err
                }
                values = append(values, sid)
        }
        if d.Err() != nil {
                return d.Err()
        }

        // Handle the last label.
        if len(values) > 0 {
                if err := w.writeLabelIndex(string(current), values); err != nil {
                        return err
                }
        }
        return nil
}

// writeLabelIndex writes the label index section for a single label name and
// records the section's offset in w.labelIndexes. The section consists of a
// 4-byte length, the number of names (always 1), the number of values, the
// values themselves as big-endian 32-bit symbol references, and a trailing
// checksum over everything between the length and the checksum.
func (w *Writer) writeLabelIndex(name string, values []uint32) error {
        // Align beginning to 4 bytes for more efficient index list scans.
        if err := w.addPadding(4); err != nil {
                return err
        }

        sectionStart := w.f.pos
        w.labelIndexes = append(w.labelIndexes, labelIndexHashEntry{
                keys:   []string{name},
                offset: sectionStart,
        })

        // Reserve 4 bytes for the length; it is patched in once the body is out.
        if err := w.write([]byte("alen")); err != nil {
                return err
        }
        w.crc32.Reset()

        // Section header: name count followed by value count.
        w.buf1.Reset()
        w.buf1.PutBE32int(1) // Number of names.
        w.buf1.PutBE32int(len(values))
        w.buf1.WriteToHash(w.crc32)
        if err := w.write(w.buf1.Get()); err != nil {
                return err
        }

        for i := range values {
                w.buf1.Reset()
                w.buf1.PutBE32(values[i])
                w.buf1.WriteToHash(w.crc32)
                if err := w.write(w.buf1.Get()); err != nil {
                        return err
                }
        }

        // Back-fill the length, which excludes the 4-byte length field itself.
        w.buf1.Reset()
        bodyLen := w.f.pos - sectionStart - 4
        if bodyLen > math.MaxUint32 {
                return fmt.Errorf("label index size exceeds 4 bytes: %d", bodyLen)
        }
        w.buf1.PutBE32int(int(bodyLen))
        if err := w.writeAt(w.buf1.Get(), sectionStart); err != nil {
                return err
        }

        // Finish the section with the accumulated checksum.
        w.buf1.Reset()
        w.buf1.PutHashSum(w.crc32)
        checksum := w.buf1.Get()
        return w.write(checksum)
}

// writeLabelIndexesOffsetTable writes the label indices offset table: a
// 4-byte length, the number of entries, and for every label index recorded in
// w.labelIndexes its keys and file offset, followed by a checksum.
func (w *Writer) writeLabelIndexesOffsetTable() error {
        tableStart := w.f.pos
        // Reserve 4 bytes for the length; writeLengthAndHash fills them in.
        if err := w.write([]byte("alen")); err != nil {
                return err
        }
        w.crc32.Reset()

        // Entry count.
        w.buf1.Reset()
        w.buf1.PutBE32int(len(w.labelIndexes))
        w.buf1.WriteToHash(w.crc32)
        if err := w.write(w.buf1.Get()); err != nil {
                return err
        }

        for i := range w.labelIndexes {
                entry := &w.labelIndexes[i]

                w.buf1.Reset()
                w.buf1.PutUvarint(len(entry.keys))
                for _, key := range entry.keys {
                        w.buf1.PutUvarintStr(key)
                }
                w.buf1.PutUvarint64(entry.offset)
                w.buf1.WriteToHash(w.crc32)
                if err := w.write(w.buf1.Get()); err != nil {
                        return err
                }
        }

        // Patch in the length and append the checksum.
        if err := w.writeLengthAndHash(tableStart); err != nil {
                return fmt.Errorf("label indexes offset table length/crc32 write error: %w", err)
        }
        return nil
}

// writePostingsOffsetTable writes the postings offset table.
//
// The entries are copied from the temporary postings offset file (w.fPO),
// with every offset shifted by w.postingsStart so that it points into the
// final index file rather than the temporary postings file. Once copied, the
// temporary file is closed and removed and w.fPO is set to nil.
func (w *Writer) writePostingsOffsetTable() error {
        // Ensure everything is in the temporary file.
        if err := w.fPO.Flush(); err != nil {
                return err
        }

        startPos := w.f.pos
        // Leave 4 bytes of space for the length, which will be calculated later.
        if err := w.write([]byte("alen")); err != nil {
                return err
        }

        // Copy over the tmp posting offset table, however we need to
        // adjust the offsets.
        adjustment := w.postingsStart

        w.buf1.Reset()
        w.crc32.Reset()
        w.buf1.PutBE32int(int(w.cntPO)) // Count.
        w.buf1.WriteToHash(w.crc32)
        if err := w.write(w.buf1.Get()); err != nil {
                return err
        }

        f, err := fileutil.OpenMmapFile(w.fPO.name)
        if err != nil {
                return err
        }
        // Safety net for the early-return paths below. On the success path the
        // mapping is closed explicitly and f is set to nil, so it is not closed
        // a second time here.
        defer func() {
                if f != nil {
                        f.Close()
                }
        }()
        d := encoding.NewDecbufRaw(realByteSlice(f.Bytes()), int(w.fPO.pos))
        cnt := w.cntPO
        // Re-encode each entry unchanged except for the adjusted offset.
        for d.Err() == nil && cnt > 0 {
                w.buf1.Reset()
                w.buf1.PutUvarint(d.Uvarint())                     // Keycount.
                w.buf1.PutUvarintStr(yoloString(d.UvarintBytes())) // Label name.
                w.buf1.PutUvarintStr(yoloString(d.UvarintBytes())) // Label value.
                w.buf1.PutUvarint64(d.Uvarint64() + adjustment)    // Offset.
                w.buf1.WriteToHash(w.crc32)
                if err := w.write(w.buf1.Get()); err != nil {
                        return err
                }
                cnt--
        }
        if d.Err() != nil {
                return d.Err()
        }

        // Cleanup temporary file.
        if err := f.Close(); err != nil {
                return err
        }
        f = nil
        if err := w.fPO.Close(); err != nil {
                return err
        }
        if err := w.fPO.Remove(); err != nil {
                return err
        }
        // Mark the temporary file as gone so Close does not touch it again.
        w.fPO = nil

        err = w.writeLengthAndHash(startPos)
        if err != nil {
                return fmt.Errorf("postings offset table length/crc32 write error: %w", err)
        }
        return nil
}

// writeLengthAndHash finishes a section that began at startPos with a 4-byte
// length placeholder: it back-fills the section length (excluding the
// placeholder itself) at startPos and appends the checksum accumulated in
// w.crc32 at the current write position. It fails if the length does not fit
// into 32 bits.
func (w *Writer) writeLengthAndHash(startPos uint64) error {
        w.buf1.Reset()
        size := w.f.pos - startPos - 4
        if size > math.MaxUint32 {
                return fmt.Errorf("length size exceeds 4 bytes: %d", size)
        }
        w.buf1.PutBE32int(int(size))
        err := w.writeAt(w.buf1.Get(), startPos)
        if err != nil {
                return fmt.Errorf("write length from buffer error: %w", err)
        }

        // Write out the hash.
        w.buf1.Reset()
        w.buf1.PutHashSum(w.crc32)
        err = w.write(w.buf1.Get())
        if err != nil {
                return fmt.Errorf("write buffer's crc32 error: %w", err)
        }
        return nil
}

// indexTOCLen is the encoded size of the table of contents in bytes: six
// 8-byte section offsets followed by a CRC32 checksum.
const indexTOCLen = 6*8 + crc32.Size

// writeTOC writes the table of contents: the offsets of the six index
// sections as big-endian 64-bit integers, in fixed order, followed by a hash
// of those bytes.
func (w *Writer) writeTOC() error {
        w.buf1.Reset()

        // The order of the entries is part of the encoding and must not change.
        sectionOffsets := [...]uint64{
                w.toc.Symbols,
                w.toc.Series,
                w.toc.LabelIndices,
                w.toc.LabelIndicesTable,
                w.toc.Postings,
                w.toc.PostingsTable,
        }
        for _, off := range sectionOffsets {
                w.buf1.PutBE64(off)
        }

        w.buf1.PutHash(w.crc32)

        encoded := w.buf1.Get()
        return w.write(encoded)
}

// writePostingsToTmpFiles builds all postings lists by re-reading the series
// section that was already written to the index file, and writes them via
// writePosting.
//
// It first makes one pass over the series section to produce the special
// "all" postings list (empty name and value). It then processes the label
// names in sorted order in batches, re-scanning the series section once per
// batch. A batch is bounded by the total number of label uses it covers, so
// that it holds no more postings in memory than the "all" list has entries.
// Context cancellation is checked after every batch.
func (w *Writer) writePostingsToTmpFiles() error {
        names := make([]string, 0, len(w.labelNames))
        for n := range w.labelNames {
                names = append(names, n)
        }
        slices.Sort(names)

        // Flush so the memory mapping below sees every series written so far.
        if err := w.f.Flush(); err != nil {
                return err
        }
        f, err := fileutil.OpenMmapFile(w.f.name)
        if err != nil {
                return err
        }
        defer f.Close()

        // Write out the special all posting.
        offsets := []uint32{}
        // The decoder is limited to the bytes before w.toc.LabelIndices and
        // positioned at the start of the series section.
        d := encoding.NewDecbufRaw(realByteSlice(f.Bytes()), int(w.toc.LabelIndices))
        d.Skip(int(w.toc.Series))
        for d.Len() > 0 {
                d.ConsumePadding()
                // Absolute file position of this series entry, derived from the
                // number of bytes still left in the decoder.
                startPos := w.toc.LabelIndices - uint64(d.Len())
                if startPos%seriesByteAlign != 0 {
                        return fmt.Errorf("series not 16-byte aligned at %d", startPos)
                }
                // Postings store the position divided by the series alignment.
                offsets = append(offsets, uint32(startPos/seriesByteAlign))
                // Skip to next series.
                x := d.Uvarint()
                d.Skip(x + crc32.Size)
                if err := d.Err(); err != nil {
                        return err
                }
        }
        if err := w.writePosting("", "", offsets); err != nil {
                return err
        }
        maxPostings := uint64(len(offsets)) // No label name can have more postings than this.

        for len(names) > 0 {
                batchNames := []string{}
                var c uint64
                // Try to bunch up label names into one loop, but avoid
                // using more memory than a single label name can.
                for len(names) > 0 {
                        if w.labelNames[names[0]]+c > maxPostings {
                                if c > 0 {
                                        break
                                }
                                // A single label name on its own exceeds the number of
                                // series, which cannot happen with consistent data.
                                return fmt.Errorf("corruption detected when writing postings to index: label %q has %d uses, but maxPostings is %d", names[0], w.labelNames[names[0]], maxPostings)
                        }
                        batchNames = append(batchNames, names[0])
                        c += w.labelNames[names[0]]
                        names = names[1:]
                }

                // Symbol references of the label names in this batch, used to pick
                // the relevant labels out of each series entry.
                nameSymbols := map[uint32]string{}
                for _, name := range batchNames {
                        sid, err := w.symbols.ReverseLookup(name)
                        if err != nil {
                                return err
                        }
                        nameSymbols[sid] = name
                }
                // Label name -> label value -> positions.
                postings := map[uint32]map[uint32][]uint32{}

                // Scan the whole series section again for this batch.
                d := encoding.NewDecbufRaw(realByteSlice(f.Bytes()), int(w.toc.LabelIndices))
                d.Skip(int(w.toc.Series))
                for d.Len() > 0 {
                        d.ConsumePadding()
                        startPos := w.toc.LabelIndices - uint64(d.Len())
                        l := d.Uvarint() // Length of this series in bytes.
                        startLen := d.Len()

                        // See if label names we want are in the series.
                        numLabels := d.Uvarint()
                        for i := 0; i < numLabels; i++ {
                                lno := uint32(d.Uvarint())
                                lvo := uint32(d.Uvarint())

                                if _, ok := nameSymbols[lno]; ok {
                                        if _, ok := postings[lno]; !ok {
                                                postings[lno] = map[uint32][]uint32{}
                                        }
                                        postings[lno][lvo] = append(postings[lno][lvo], uint32(startPos/seriesByteAlign))
                                }
                        }
                        // Skip to next series.
                        // startLen - d.Len() is the number of bytes consumed since the
                        // length prefix; the remainder of the entry plus its checksum is
                        // skipped.
                        d.Skip(l - (startLen - d.Len()) + crc32.Size)
                        if err := d.Err(); err != nil {
                                return err
                        }
                }

                for _, name := range batchNames {
                        // Write out postings for this label name.
                        sid, err := w.symbols.ReverseLookup(name)
                        if err != nil {
                                return err
                        }
                        values := make([]uint32, 0, len(postings[sid]))
                        for v := range postings[sid] {
                                values = append(values, v)
                        }
                        // Symbol numbers are in order, so the strings will also be in order.
                        slices.Sort(values)
                        for _, v := range values {
                                value, err := w.symbols.Lookup(v)
                                if err != nil {
                                        return err
                                }
                                if err := w.writePosting(name, value, postings[sid][v]); err != nil {
                                        return err
                                }
                        }
                }
                // Non-blocking cancellation check between batches.
                select {
                case <-w.ctx.Done():
                        return w.ctx.Err()
                default:
                }
        }
        return nil
}

// EncodePostingsRaw uses the "basic" postings list encoding format with no compression:
// <BE uint32 len X><BE uint32 0><BE uint32 1>...<BE uint32 X-1>.
//
// It appends the number of offsets followed by every offset to e, each as a
// big-endian 32-bit integer.
//
// The returned error is always nil: offs is a []uint32, so every offset fits
// into 4 bytes by construction and no range check is needed. The error result
// is kept so the function's signature stays unchanged for existing callers.
func EncodePostingsRaw(e *encoding.Encbuf, offs []uint32) error {
        e.PutBE32int(len(offs))

        for _, off := range offs {
                e.PutBE32(off)
        }
        return nil
}

// writePosting appends one postings list to the temporary postings file
// (w.fP) and records a matching entry for it in the temporary postings offset
// table (w.fPO). The list is encoded with w.postingsEncoder and written as a
// 4-byte length, the encoded list and a trailing hash.
func (w *Writer) writePosting(name, value string, offs []uint32) error {
        // Align beginning to 4 bytes for more efficient postings list scans.
        if err := w.fP.AddPadding(4); err != nil {
                return err
        }

        // Write out postings offset table to temporary file as we go.
        w.buf1.Reset()
        w.buf1.PutUvarint(2)
        w.buf1.PutUvarintStr(name)
        w.buf1.PutUvarintStr(value)
        // This is relative to the postings tmp file, not the final index file.
        w.buf1.PutUvarint64(w.fP.pos)
        offsetEntry := w.buf1.Get()
        if err := w.fPO.Write(offsetEntry); err != nil {
                return err
        }
        w.cntPO++

        // Encode the list itself into buf1.
        w.buf1.Reset()
        if err := w.postingsEncoder(&w.buf1, offs); err != nil {
                return err
        }

        // buf2 carries the length prefix. The length is taken before the hash is
        // appended to buf1, so it covers the encoded list only.
        w.buf2.Reset()
        encodedLen := w.buf1.Len()
        // We convert to uint to make code compile on 32-bit systems, as math.MaxUint32 doesn't fit into int there.
        if uint(encodedLen) > math.MaxUint32 {
                return fmt.Errorf("posting size exceeds 4 bytes: %d", encodedLen)
        }
        w.buf2.PutBE32int(encodedLen)
        w.buf1.PutHash(w.crc32)

        lengthPrefix, payload := w.buf2.Get(), w.buf1.Get()
        return w.fP.Write(lengthPrefix, payload)
}

// writePostings copies the temporary postings file (w.fP) into the main index
// file, records where the postings begin in w.postingsStart, and then closes
// and removes the temporary file, setting w.fP to nil.
func (w *Writer) writePostings() error {
        // There's padding in the tmp file, make sure it actually works.
        if err := w.f.AddPadding(4); err != nil {
                return err
        }
        w.postingsStart = w.f.pos

        // Copy temporary file into main index.
        if err := w.fP.Flush(); err != nil {
                return err
        }
        if _, err := w.fP.f.Seek(0, io.SeekStart); err != nil {
                return err
        }
        // Don't need to calculate a checksum, so can copy directly.
        copyBuf := make([]byte, 1<<20)
        copied, err := io.CopyBuffer(w.f.fbuf, w.fP.f, copyBuf)
        if err != nil {
                return err
        }
        if w.fP.pos != uint64(copied) {
                return fmt.Errorf("wrote %d bytes to posting temporary file, but only read back %d", w.fP.pos, copied)
        }
        // The copy bypassed the position tracking of w.f, so account for it here.
        w.f.pos += uint64(copied)

        // The temporary file is no longer needed.
        if err := w.fP.Close(); err != nil {
                return err
        }
        if err := w.fP.Remove(); err != nil {
                return err
        }
        w.fP = nil
        return nil
}

// labelIndexHashEntry is one entry of the label indices offset table: it
// records where in the index file a label index section was written.
type labelIndexHashEntry struct {
        // keys holds the label names the section covers; writeLabelIndex always
        // stores exactly one name.
        keys   []string
        // offset is the file position at which the section starts.
        offset uint64
}

// Close finalizes the index by advancing the writer to the done stage and
// then closes every file the writer still holds: the symbol table mapping,
// the two temporary postings files (if they still exist) and the index file
// itself.
//
// All files are closed even if finalizing or an earlier close fails, so no
// handle is leaked. The returned error is the first close error encountered;
// if every close succeeds, the error from finalizing the index (if any) is
// returned.
func (w *Writer) Close() error {
        // Even if this fails, we need to close all the files.
        ensureErr := w.ensureStage(idxStageDone)

        // closeErr keeps the first close failure while the remaining files are
        // still closed.
        var closeErr error
        if w.symbolFile != nil {
                if err := w.symbolFile.Close(); err != nil && closeErr == nil {
                        closeErr = err
                }
        }
        if w.fP != nil {
                if err := w.fP.Close(); err != nil && closeErr == nil {
                        closeErr = err
                }
        }
        if w.fPO != nil {
                if err := w.fPO.Close(); err != nil && closeErr == nil {
                        closeErr = err
                }
        }
        if err := w.f.Close(); err != nil && closeErr == nil {
                closeErr = err
        }

        if closeErr != nil {
                return closeErr
        }
        return ensureErr
}

// StringIter iterates over a sorted list of strings.
//
// Callers advance with Next, read the current element with At, and check Err
// for a failure.
type StringIter interface {
        // Next advances the iterator and returns true if another value was found.
        Next() bool

        // At returns the value at the current iterator position.
        At() string

        // Err returns the last error of the iterator.
        Err() error
}

// Reader provides read access to an index held in a ByteSlice.
type Reader struct {
        // b is the raw index data; toc is the table of contents decoded from it.
        b   ByteSlice
        toc *TOC

        // Close that releases the underlying resources of the byte slice.
        c io.Closer

        // Map of LabelName to a list of some LabelValues's position in the offset table.
        // The first and last values for each name are always present.
        postings map[string][]postingOffset
        // For the v1 format, labelname -> labelvalue -> offset.
        postingsV1 map[string]map[string]uint64

        // symbols resolves symbol references stored in the index.
        symbols     *Symbols
        nameSymbols map[uint32]string // Cache of the label name symbol lookups,
        // as there are not many and they are half of all lookups.
        st *labels.SymbolTable // TODO: see if we can merge this with nameSymbols.

        // dec decodes index entries, resolving symbols through this reader.
        dec *Decoder

        // version is the format version read from the index header.
        version int
}

// postingOffset is a sampled entry of the postings offset table: a label
// value together with the position reported for it while reading the table.
type postingOffset struct {
        // value is the label value of the entry.
        value string
        // off is the entry's position in the offset table, as passed to the
        // callback of ReadPostingsOffsetTable.
        off   int
}

// ByteSlice abstracts a byte slice, so that index data can be read without
// depending on how the bytes are stored.
type ByteSlice interface {
        // Len reports the number of bytes available.
        Len() int
        // Range returns the bytes between start and end.
        Range(start, end int) []byte
}

// realByteSlice is the ByteSlice implementation backed by a plain []byte.
type realByteSlice []byte

// Len returns the length of the underlying slice.
func (s realByteSlice) Len() int {
        return len(s)
}

// Range returns s[start:end]. The result shares memory with s.
func (s realByteSlice) Range(start, end int) []byte {
        return []byte(s)[start:end]
}

// Sub returns s[start:end] as a ByteSlice. The result shares memory with s.
func (s realByteSlice) Sub(start, end int) ByteSlice {
        sub := s[start:end]
        return sub
}

// NewReader returns a new index reader on the given byte slice. It automatically
// handles different format versions.
//
// The reader is given a no-op closer, since it does not own any resource
// behind b.
func NewReader(b ByteSlice) (*Reader, error) {
        noopCloser := io.NopCloser(nil)
        return newReader(b, noopCloser)
}

// NewFileReader returns a new index reader against the given index file.
//
// The file is memory-mapped and the mapping is handed to the reader as its
// closer. If the reader cannot be created, the mapping is closed again and
// both errors are reported together.
func NewFileReader(path string) (*Reader, error) {
        mapped, err := fileutil.OpenMmapFile(path)
        if err != nil {
                return nil, err
        }

        reader, err := newReader(realByteSlice(mapped.Bytes()), mapped)
        if err == nil {
                return reader, nil
        }

        // Do not leak the mapping when the index turned out to be unreadable.
        return nil, tsdb_errors.NewMulti(
                err,
                mapped.Close(),
        ).Err()
}

// newReader builds a Reader on top of b, using c as the reader's closer.
//
// It verifies the header (size, magic number and format version), reads the
// table of contents and the symbol table, and loads the postings offset
// table: completely for FormatV1, and as a sample of entries for FormatV2.
// Finally it caches the symbol reference of every label name.
func newReader(b ByteSlice, c io.Closer) (*Reader, error) {
        r := &Reader{
                b:        b,
                c:        c,
                postings: map[string][]postingOffset{},
                st:       labels.NewSymbolTable(),
        }

        // Verify header.
        if r.b.Len() < HeaderLen {
                return nil, fmt.Errorf("index header: %w", encoding.ErrInvalidSize)
        }
        if m := binary.BigEndian.Uint32(r.b.Range(0, 4)); m != MagicIndex {
                return nil, fmt.Errorf("invalid magic number %x", m)
        }
        // The format version is the single byte following the magic number.
        r.version = int(r.b.Range(4, 5)[0])

        if r.version != FormatV1 && r.version != FormatV2 {
                return nil, fmt.Errorf("unknown index file version %d", r.version)
        }

        var err error
        r.toc, err = NewTOCFromByteSlice(b)
        if err != nil {
                return nil, fmt.Errorf("read TOC: %w", err)
        }

        r.symbols, err = NewSymbols(r.b, r.version, int(r.toc.Symbols))
        if err != nil {
                return nil, fmt.Errorf("read symbols: %w", err)
        }

        if r.version == FormatV1 {
                // Earlier V1 formats don't have a sorted postings offset table, so
                // load the whole offset table into memory.
                r.postingsV1 = map[string]map[string]uint64{}
                if err := ReadPostingsOffsetTable(r.b, r.toc.PostingsTable, func(name, value []byte, off uint64, _ int) error {
                        if _, ok := r.postingsV1[string(name)]; !ok {
                                r.postingsV1[string(name)] = map[string]uint64{}
                                r.postings[string(name)] = nil // Used to get a list of labelnames in places.
                        }
                        r.postingsV1[string(name)][string(value)] = off
                        return nil
                }); err != nil {
                        return nil, fmt.Errorf("read postings table: %w", err)
                }
        } else {
                // lastName/lastValue/lastOff track the most recent entry that was
                // NOT sampled, so it can still be added if it turns out to be the
                // last value of its label name. lastName is nil when the most recent
                // entry was sampled and nothing is pending.
                var lastName, lastValue []byte
                lastOff := 0
                // valueCount is the number of values seen so far for the current name.
                valueCount := 0
                // For the postings offset table we keep every label name but only every nth
                // label value (plus the first and last one), to save memory.
                if err := ReadPostingsOffsetTable(r.b, r.toc.PostingsTable, func(name, value []byte, _ uint64, off int) error {
                        if _, ok := r.postings[string(name)]; !ok {
                                // Next label name.
                                r.postings[string(name)] = []postingOffset{}
                                if lastName != nil {
                                        // Always include last value for each label name.
                                        r.postings[string(lastName)] = append(r.postings[string(lastName)], postingOffset{value: string(lastValue), off: lastOff})
                                }
                                valueCount = 0
                        }
                        if valueCount%symbolFactor == 0 {
                                // Sampled entry; this also covers the first value of a name.
                                r.postings[string(name)] = append(r.postings[string(name)], postingOffset{value: string(value), off: off})
                                lastName, lastValue = nil, nil
                        } else {
                                lastName, lastValue = name, value
                                lastOff = off
                        }
                        valueCount++
                        return nil
                }); err != nil {
                        return nil, fmt.Errorf("read postings table: %w", err)
                }
                // Flush the pending last value of the final label name, if any.
                if lastName != nil {
                        r.postings[string(lastName)] = append(r.postings[string(lastName)], postingOffset{value: string(lastValue), off: lastOff})
                }
                // Trim any extra space in the slices.
                for k, v := range r.postings {
                        l := make([]postingOffset, len(v))
                        copy(l, v)
                        r.postings[k] = l
                }
        }

        r.nameSymbols = make(map[uint32]string, len(r.postings))
        for k := range r.postings {
                // The empty name belongs to the special "all" postings entry and is
                // not looked up in the symbol table.
                if k == "" {
                        continue
                }
                off, err := r.symbols.ReverseLookup(k)
                if err != nil {
                        return nil, fmt.Errorf("reverse symbol lookup: %w", err)
                }
                r.nameSymbols[off] = k
        }

        r.dec = &Decoder{LookupSymbol: r.lookupSymbol}

        return r, nil
}

// Version returns the file format version of the underlying index, as read
// from the index header when the reader was created.
func (r *Reader) Version() int {
        return r.version
}

// Range marks a byte range delimited by the offsets Start and End.
type Range struct {
        Start, End int64
}

// PostingsRanges returns a new map of byte range in the underlying index file
// for all postings lists.
//
// Each range starts 4 bytes after the offset recorded in the postings offset
// table, i.e. after the list's length field, and spans the length of the
// decoded list.
func (r *Reader) PostingsRanges() (map[labels.Label]Range, error) {
        ranges := map[labels.Label]Range{}

        collect := func(name, value []byte, off uint64, _ int) error {
                d := encoding.NewDecbufAt(r.b, int(off), castagnoliTable)
                if err := d.Err(); err != nil {
                        return err
                }
                start := int64(off) + 4
                key := labels.Label{Name: string(name), Value: string(value)}
                ranges[key] = Range{
                        Start: start,
                        End:   start + int64(d.Len()),
                }
                return nil
        }

        if err := ReadPostingsOffsetTable(r.b, r.toc.PostingsTable, collect); err != nil {
                return nil, fmt.Errorf("read postings table: %w", err)
        }
        return ranges, nil
}

// Symbols provides lookups in the symbol table of an index. It keeps only a
// sample of symbol positions in memory and decodes the symbols themselves
// from the underlying bytes on demand.
type Symbols struct {
        // bs is the data containing the symbol table, version the index format
        // version, and off the position at which the symbol table starts.
        bs      ByteSlice
        version int
        off     int

        // offsets holds the position of every symbolFactor-th symbol.
        offsets []int
        // seen is the number of symbols read while building offsets.
        seen    int
}

// symbolFactor is the sampling interval for in-memory lookup tables: only
// every symbolFactor-th entry is kept in memory, and the entries in between
// are found by decoding forward from the nearest kept one.
const symbolFactor = 32

// NewSymbols returns a Symbols object for symbol lookups.
//
// It reads through the whole symbol table starting at off in bs once,
// recording the position of every symbolFactor-th symbol. An error is
// returned if the table cannot be decoded.
func NewSymbols(bs ByteSlice, version, off int) (*Symbols, error) {
        s := &Symbols{
                bs:      bs,
                version: version,
                off:     off,
        }
        d := encoding.NewDecbufAt(bs, off, castagnoliTable)
        var (
                // origLen must be captured before the count is read, so that
                // origLen - d.Len() is the number of bytes consumed so far.
                origLen = d.Len()
                cnt     = d.Be32int()
                // basePos skips the 4 bytes preceding the decoded region
                // (presumably the table's length field — confirm in NewDecbufAt).
                basePos = off + 4
        )
        s.offsets = make([]int, 0, 1+cnt/symbolFactor)
        for d.Err() == nil && s.seen < cnt {
                if s.seen%symbolFactor == 0 {
                        // Position of this symbol within bs.
                        s.offsets = append(s.offsets, basePos+origLen-d.Len())
                }
                d.UvarintBytes() // The symbol.
                s.seen++
        }
        if d.Err() != nil {
                return nil, d.Err()
        }
        return s, nil
}

// Lookup resolves the symbol reference o to its string.
// For FormatV2 o is the ordinal of the symbol: decoding starts at the closest
// remembered position at or before it and skips forward symbol by symbol.
// For any other version o is used directly as the position to decode at.
func (s Symbols) Lookup(o uint32) (string, error) {
        d := encoding.Decbuf{B: s.bs.Range(0, s.bs.Len())}

        switch s.version {
        case FormatV2:
                if int(o) >= s.seen {
                        return "", fmt.Errorf("unknown symbol offset %d", o)
                }
                d.Skip(s.offsets[int(o/symbolFactor)])
                // Step over the symbols between the remembered position and o.
                for pending := o % symbolFactor; pending > 0; pending-- {
                        d.UvarintBytes()
                }
        default:
                d.Skip(int(o))
        }

        sym := d.UvarintStr()
        if err := d.Err(); err != nil {
                return "", err
        }
        return sym, nil
}

// ReverseLookup returns the reference under which sym is stored in the symbol
// table. For FormatV2 that is the ordinal of the symbol, for any other version
// it is the position in the byte slice at which the symbol starts.
// An error is returned if the table holds no symbols, if sym is not part of
// it, or if decoding fails.
func (s Symbols) ReverseLookup(sym string) (uint32, error) {
        if len(s.offsets) == 0 {
                return 0, fmt.Errorf("unknown symbol %q - no symbols", sym)
        }
        // Find the first remembered symbol that sorts after sym; sym, if it
        // exists, lives in the group of symbols right before it.
        i := sort.Search(len(s.offsets), func(i int) bool {
                // Any decoding errors here will be lost, however
                // we already read through all of this at startup.
                d := encoding.Decbuf{
                        B: s.bs.Range(0, s.bs.Len()),
                }
                d.Skip(s.offsets[i])
                return yoloString(d.UvarintBytes()) > sym
        })
        d := encoding.Decbuf{
                B: s.bs.Range(0, s.bs.Len()),
        }
        if i > 0 {
                i--
        }
        d.Skip(s.offsets[i])
        res := i * symbolFactor
        var lastLen int
        var lastSymbol string
        // Valid symbol ordinals are 0..seen-1, so stop before decoding entry
        // number seen: the bytes after the last symbol are not a symbol and
        // must neither be compared against sym nor be able to raise a
        // decoding error for a symbol that is simply absent.
        for d.Err() == nil && res < s.seen {
                lastLen = d.Len()
                lastSymbol = yoloString(d.UvarintBytes())
                if lastSymbol >= sym {
                        break
                }
                res++
        }
        if d.Err() != nil {
                return 0, d.Err()
        }
        if lastSymbol != sym {
                return 0, fmt.Errorf("unknown symbol %q", sym)
        }
        if s.version == FormatV2 {
                return uint32(res), nil
        }
        // lastLen is what was left to decode right before the symbol was
        // read, so the difference to the total length is its start position.
        return uint32(s.bs.Len() - lastLen), nil
}

// Size returns the number of remembered symbol positions multiplied by 8
// (presumably the in-memory size in bytes of the offsets slice, assuming
// 8-byte ints — the symbol bytes themselves are not counted).
func (s Symbols) Size() int {
        return len(s.offsets) * 8
}

// Iter returns a StringIter over the symbols of the table. It opens a new
// decoder at the start of the table and reads the symbol count, which bounds
// how many symbols the iterator yields.
func (s Symbols) Iter() StringIter {
        d := encoding.NewDecbufAt(s.bs, s.off, castagnoliTable)
        cnt := d.Be32int()
        return &symbolsIter{
                d:   d,
                cnt: cnt,
        }
}

// symbolsIter implements StringIter on top of a decoder positioned inside a
// symbol table.
type symbolsIter struct {
        d   encoding.Decbuf // Decoder the symbols are read from.
        cnt int             // Number of symbols still to be read.
        cur string          // Symbol returned by At.
        err error           // First decoding error, returned by Err.
}

// Next decodes the next symbol. It returns false once all symbols have been
// consumed or a decoding error occurred.
func (s *symbolsIter) Next() bool {
        if s.err != nil || s.cnt == 0 {
                return false
        }
        raw := s.d.UvarintBytes()
        s.cur = yoloString(raw)
        s.cnt--
        if err := s.d.Err(); err != nil {
                s.err = err
                return false
        }
        return true
}

// At returns the symbol decoded by the last call to Next.
func (s symbolsIter) At() string { return s.cur }

// Err returns the decoding error that stopped the iteration, if any.
func (s symbolsIter) Err() error { return s.err }

// ReadPostingsOffsetTable decodes the postings offset table located at off in
// bs and invokes f once per entry, handing it the label name, the label value,
// the offset of the postings list and the position of the entry relative to
// the start of the table.
// name and value alias the memory backing bs; f has to copy them if it wants
// to keep them beyond the call.
// Decoding stops at the first error returned by f, which is then returned
// unchanged.
func ReadPostingsOffsetTable(bs ByteSlice, off uint64, f func(name, value []byte, postingsOffset uint64, labelOffset int) error) error {
        d := encoding.NewDecbufAt(bs, int(off), castagnoliTable)
        tableLen := d.Len()

        for remaining := d.Be32(); remaining > 0 && d.Err() == nil && d.Len() > 0; remaining-- {
                entryPos := tableLen - d.Len()

                keyCount := d.Uvarint()
                if keyCount != 2 {
                        return fmt.Errorf("unexpected number of keys for postings offset table %d", keyCount)
                }
                name := d.UvarintBytes()
                value := d.UvarintBytes()
                postingsOff := d.Uvarint64()
                if d.Err() != nil {
                        // Do not hand a partially decoded entry to f.
                        break
                }
                if err := f(name, value, postingsOff, entryPos); err != nil {
                        return err
                }
        }
        return d.Err()
}

// Close closes the reader by closing its underlying closer r.c and returns
// the error of that call.
func (r *Reader) Close() error {
        return r.c.Close()
}

// lookupSymbol resolves the symbol reference o. References found in the
// nameSymbols map are answered from there; everything else is decoded from
// the symbol table. ctx is currently not consulted.
func (r *Reader) lookupSymbol(ctx context.Context, o uint32) (string, error) {
        name, cached := r.nameSymbols[o]
        if !cached {
                return r.symbols.Lookup(o)
        }
        return name, nil
}

// Symbols returns an iterator over the symbols that exist within the index.
// Every call creates a new iterator via the symbol table's Iter method.
func (r *Reader) Symbols() StringIter {
        return r.symbols.Iter()
}

// SymbolTableSize returns the symbol table size in bytes, as reported by
// Symbols.Size.
func (r *Reader) SymbolTableSize() uint64 {
        return uint64(r.symbols.Size())
}

// SortedLabelValues returns the values that exist for the given label name,
// in sorted order.
// It is not safe to use the return value beyond the lifetime of the byte slice
// passed into the Reader.
func (r *Reader) SortedLabelValues(ctx context.Context, name string, matchers ...*labels.Matcher) ([]string, error) {
        values, err := r.LabelValues(ctx, name, matchers...)
        if err != nil {
                return values, err
        }
        // Only FormatV1 results are sorted here: LabelValues collects them
        // from a map there, while other versions are returned in table order.
        if r.version == FormatV1 {
                slices.Sort(values)
        }
        return values, nil
}

// LabelValues returns the values that exist for the given label name.
// A name that is unknown to the index yields a nil slice and no error.
// It is not safe to use the return value beyond the lifetime of the byte slice
// passed into the Reader.
// TODO(replay): Support filtering by matchers.
func (r *Reader) LabelValues(ctx context.Context, name string, matchers ...*labels.Matcher) ([]string, error) {
        if len(matchers) > 0 {
                return nil, fmt.Errorf("matchers parameter is not implemented: %+v", matchers)
        }

        if r.version == FormatV1 {
                byValue, found := r.postingsV1[name]
                if !found {
                        return nil, nil
                }
                out := make([]string, 0, len(byValue))
                for value := range byValue {
                        out = append(out, value)
                }
                return out, nil
        }

        entries, found := r.postings[name]
        if !found || len(entries) == 0 {
                return nil, nil
        }

        // Walk the on-disk table from the first recorded entry of this name
        // and stop after the last recorded value has been collected.
        first, last := entries[0], entries[len(entries)-1]
        out := make([]string, 0, len(entries)*symbolFactor)
        collect := func(val string, _ uint64) (bool, error) {
                out = append(out, val)
                return val != last.value, nil
        }
        err := r.traversePostingOffsets(ctx, first.off, collect)
        return out, err
}

// LabelNamesFor returns all the label names for the series referred to by IDs.
// The names returned are sorted.
// The context is checked every checkContextEveryNIterations postings. An
// error is returned if the context is done, if the postings iterator fails,
// if a series cannot be decoded, or if a name symbol cannot be resolved.
func (r *Reader) LabelNamesFor(ctx context.Context, postings Postings) ([]string, error) {
        // Gather the offsets of the label names in the symbol table first, so
        // that every distinct name is resolved only once.
        offsetsMap := make(map[uint32]struct{})
        i := 0
        for postings.Next() {
                id := postings.At()
                i++

                if i%checkContextEveryNIterations == 0 {
                        if ctxErr := ctx.Err(); ctxErr != nil {
                                return nil, ctxErr
                        }
                }

                offset := id
                // In version 2 series IDs are no longer exact references but series are 16-byte padded
                // and the ID is the multiple of 16 of the actual position.
                if r.version == FormatV2 {
                        offset = id * seriesByteAlign
                }

                d := encoding.NewDecbufUvarintAt(r.b, int(offset), castagnoliTable)
                buf := d.Get()
                if d.Err() != nil {
                        return nil, fmt.Errorf("get buffer for series: %w", d.Err())
                }

                offsets, err := r.dec.LabelNamesOffsetsFor(buf)
                if err != nil {
                        return nil, fmt.Errorf("get label name offsets: %w", err)
                }
                for _, off := range offsets {
                        offsetsMap[off] = struct{}{}
                }
        }
        // Next also returns false when the iterator fails. Report that
        // instead of returning names gathered from a truncated postings list.
        if err := postings.Err(); err != nil {
                return nil, fmt.Errorf("iterate postings: %w", err)
        }

        // Lookup the unique symbols.
        names := make([]string, 0, len(offsetsMap))
        for off := range offsetsMap {
                name, err := r.lookupSymbol(ctx, off)
                if err != nil {
                        return nil, fmt.Errorf("lookup symbol in LabelNamesFor: %w", err)
                }
                names = append(names, name)
        }

        slices.Sort(names)

        return names, nil
}

// LabelValueFor returns the value of the given label name in the series
// referred to by id. storage.ErrNotFound is returned when the series does not
// carry the label, when its value is empty, or when decoding the labels fails.
func (r *Reader) LabelValueFor(ctx context.Context, id storage.SeriesRef, label string) (string, error) {
        pos := id
        // In version 2 series IDs are no longer exact references but series are 16-byte padded
        // and the ID is the multiple of 16 of the actual position.
        if r.version == FormatV2 {
                pos *= seriesByteAlign
        }

        d := encoding.NewDecbufUvarintAt(r.b, int(pos), castagnoliTable)
        buf := d.Get()
        if err := d.Err(); err != nil {
                return "", fmt.Errorf("label values for: %w", err)
        }

        value, err := r.dec.LabelValueFor(ctx, buf, label)
        if err != nil || value == "" {
                return "", storage.ErrNotFound
        }
        return value, nil
}

// Series reads the series with the given ID and writes its labels into builder
// and its chunk metas into chks. builder is pointed at the reader's symbol
// table and reset before decoding.
func (r *Reader) Series(id storage.SeriesRef, builder *labels.ScratchBuilder, chks *[]chunks.Meta) error {
        pos := id
        // In version 2 series IDs are no longer exact references but series are 16-byte padded
        // and the ID is the multiple of 16 of the actual position.
        if r.version == FormatV2 {
                pos *= seriesByteAlign
        }

        d := encoding.NewDecbufUvarintAt(r.b, int(pos), castagnoliTable)
        if err := d.Err(); err != nil {
                return err
        }

        builder.SetSymbolTable(r.st)
        builder.Reset()
        if err := r.dec.Series(d.Get(), builder, chks); err != nil {
                return fmt.Errorf("read series: %w", err)
        }
        return nil
}

// traversePostingOffsets traverses r's posting offsets table, starting at off, and calls cb with every label value and postings offset.
// If cb returns false (or an error), the traversing is interrupted.
// off is applied as a skip inside the postings offset table, not as a
// position in the whole index. The label value handed to cb is created with
// yoloString and therefore aliases the decoded bytes.
// The context is checked before the first entry and after every entry; a
// decoding or context error is returned wrapped, an error from cb unchanged.
func (r *Reader) traversePostingOffsets(ctx context.Context, off int, cb func(string, uint64) (bool, error)) error {
        // Don't Crc32 the entire postings offset table, this is very slow
        // so hope any issues were caught at startup.
        d := encoding.NewDecbufAt(r.b, int(r.toc.PostingsTable), nil)
        d.Skip(off)
        // skip is the encoded size of key count plus label name. It is
        // measured on the first entry and reused for all following ones.
        skip := 0
        ctxErr := ctx.Err()
        for d.Err() == nil && ctxErr == nil {
                if skip == 0 {
                        // These are always the same number of bytes,
                        // and it's faster to skip than to parse.
                        skip = d.Len()
                        d.Uvarint()      // Keycount.
                        d.UvarintBytes() // Label name.
                        skip -= d.Len()
                } else {
                        d.Skip(skip)
                }
                v := yoloString(d.UvarintBytes()) // Label value.
                postingsOff := d.Uvarint64()      // Offset.
                // cb runs before d.Err() is looked at again, so it may see the
                // result of a failed decode; the error is reported below.
                if ok, err := cb(v, postingsOff); err != nil {
                        return err
                } else if !ok {
                        break
                }
                ctxErr = ctx.Err()
        }
        if d.Err() != nil {
                return fmt.Errorf("get postings offset entry: %w", d.Err())
        }
        if ctxErr != nil {
                return fmt.Errorf("get postings offset entry: %w", ctxErr)
        }
        return nil
}

// Postings returns the merged postings lists of the given values of label
// name. Values that do not exist in the index are skipped. An unknown name, a
// name without recorded entries, or (outside FormatV1) an empty values list
// yields EmptyPostings.
// Outside FormatV1 the values slice is sorted in place.
func (r *Reader) Postings(ctx context.Context, name string, values ...string) (Postings, error) {
        if r.version == FormatV1 {
                e, ok := r.postingsV1[name]
                if !ok {
                        return EmptyPostings(), nil
                }
                res := make([]Postings, 0, len(values))
                for _, v := range values {
                        postingsOff, ok := e[v]
                        if !ok {
                                continue
                        }
                        // Read from the postings table.
                        d := encoding.NewDecbufAt(r.b, int(postingsOff), castagnoliTable)
                        _, p, err := r.dec.Postings(d.Get())
                        if err != nil {
                                return nil, fmt.Errorf("decode postings: %w", err)
                        }
                        res = append(res, p)
                }
                return Merge(ctx, res...), nil
        }

        e, ok := r.postings[name]
        if !ok || len(e) == 0 {
                // e[0] is indexed below, so a name without entries has to be
                // rejected here, as LabelValues and PostingsForLabelMatching
                // already do.
                return EmptyPostings(), nil
        }

        if len(values) == 0 {
                return EmptyPostings(), nil
        }

        slices.Sort(values) // Values must be in order so we can step through the table on disk.
        res := make([]Postings, 0, len(values))
        valueIndex := 0
        for valueIndex < len(values) && values[valueIndex] < e[0].value {
                // Discard values before the start.
                valueIndex++
        }
        for valueIndex < len(values) {
                value := values[valueIndex]

                // e only holds a sample of the on-disk entries: find the
                // sampled entry at or right before the wanted value.
                i := sort.Search(len(e), func(i int) bool { return e[i].value >= value })
                if i == len(e) {
                        // We're past the end.
                        break
                }
                if i > 0 && e[i].value != value {
                        // Need to look from previous entry.
                        i--
                }

                if err := r.traversePostingOffsets(ctx, e[i].off, func(val string, postingsOff uint64) (bool, error) {
                        // Consume every requested value the table has reached
                        // or passed; only exact matches contribute postings.
                        for val >= value {
                                if val == value {
                                        // Read from the postings table.
                                        d2 := encoding.NewDecbufAt(r.b, int(postingsOff), castagnoliTable)
                                        _, p, err := r.dec.Postings(d2.Get())
                                        if err != nil {
                                                return false, fmt.Errorf("decode postings: %w", err)
                                        }
                                        res = append(res, p)
                                }
                                valueIndex++
                                if valueIndex == len(values) {
                                        break
                                }
                                value = values[valueIndex]
                        }
                        if i+1 == len(e) || value >= e[i+1].value || valueIndex == len(values) {
                                // Need to go to a later postings offset entry, if there is one.
                                return false, nil
                        }
                        return true, nil
                }); err != nil {
                        return nil, err
                }
        }

        return Merge(ctx, res...), nil
}

// PostingsForLabelMatching returns the merged postings of all values of label
// name for which match returns true. Errors are reported through the returned
// Postings.
func (r *Reader) PostingsForLabelMatching(ctx context.Context, name string, match func(string) bool) Postings {
        if r.version == FormatV1 {
                return r.postingsForLabelMatchingV1(ctx, name, match)
        }

        entries := r.postings[name]
        if len(entries) == 0 {
                return EmptyPostings()
        }

        var matched []Postings
        final := entries[len(entries)-1].value
        visit := func(val string, postingsOff uint64) (bool, error) {
                // Keep traversing until the last recorded value was visited.
                more := val != final
                if !match(val) {
                        return more, nil
                }
                // The value matches, so its postings list is wanted.
                dec := encoding.NewDecbufAt(r.b, int(postingsOff), castagnoliTable)
                _, p, err := r.dec.PostingsFromDecbuf(dec)
                if err != nil {
                        return false, fmt.Errorf("decode postings: %w", err)
                }
                matched = append(matched, p)
                return more, nil
        }
        if err := r.traversePostingOffsets(ctx, entries[0].off, visit); err != nil {
                return ErrPostings(err)
        }

        return Merge(ctx, matched...)
}

// postingsForLabelMatchingV1 is the FormatV1 variant of
// PostingsForLabelMatching. It iterates the in-memory value map of the label
// and checks the context every checkContextEveryNIterations values.
func (r *Reader) postingsForLabelMatchingV1(ctx context.Context, name string, match func(string) bool) Postings {
        byValue := r.postingsV1[name]
        if len(byValue) == 0 {
                return EmptyPostings()
        }

        var matched []Postings
        visited := 0
        for val, offset := range byValue {
                visited++
                if visited%checkContextEveryNIterations == 0 && ctx.Err() != nil {
                        return ErrPostings(ctx.Err())
                }
                if match(val) {
                        // Read from the postings table.
                        d := encoding.NewDecbufAt(r.b, int(offset), castagnoliTable)
                        _, p, err := r.dec.PostingsFromDecbuf(d)
                        if err != nil {
                                return ErrPostings(fmt.Errorf("decode postings: %w", err))
                        }
                        matched = append(matched, p)
                }
        }

        return Merge(ctx, matched...)
}

// SortedPostings returns the given postings list reordered so that the backing series
// are sorted.
// This implementation returns p unchanged (presumably because postings read
// from an index file already satisfy that order — TODO confirm).
func (r *Reader) SortedPostings(p Postings) Postings {
        return p
}

// ShardedPostings returns a postings list filtered by the provided shardIndex out of shardCount.
// A series belongs to the shard when the stable hash of its labels modulo
// shardCount equals shardIndex. Failures to read a series and errors of the
// input iterator are reported through the returned Postings.
func (r *Reader) ShardedPostings(p Postings, shardIndex, shardCount uint64) Postings {
        var (
                out     = make([]storage.SeriesRef, 0, 128)
                bufLbls = labels.ScratchBuilder{}
        )

        for p.Next() {
                id := p.At()

                // Get the series labels (no chunks).
                err := r.Series(id, &bufLbls, nil)
                if err != nil {
                        return ErrPostings(fmt.Errorf("series %d not found", id))
                }

                // Check if the series belong to the shard.
                if labels.StableHash(bufLbls.Labels())%shardCount != shardIndex {
                        continue
                }

                out = append(out, id)
        }
        // Next also returns false when the iterator fails. Surface that
        // instead of handing out a silently truncated shard.
        if err := p.Err(); err != nil {
                return ErrPostings(err)
        }

        return NewListPostings(out)
}

// Size returns the size of an index file, taken from the length of the byte
// slice the reader operates on.
func (r *Reader) Size() int64 {
        return int64(r.b.Len())
}

// LabelNames returns all the unique label names present in the index, sorted.
// The name of the all-postings key is left out.
// TODO(twilkie) implement support for matchers.
func (r *Reader) LabelNames(_ context.Context, matchers ...*labels.Matcher) ([]string, error) {
        if len(matchers) > 0 {
                return nil, fmt.Errorf("matchers parameter is not implemented: %+v", matchers)
        }

        names := make([]string, 0, len(r.postings))
        for name := range r.postings {
                // The all-postings key is not from any metric.
                if name != allPostingsKey.Name {
                        names = append(names, name)
                }
        }
        slices.Sort(names)
        return names, nil
}

// NewStringListIter returns a StringIter for the given sorted list of strings.
// The iterator walks s itself rather than a copy and yields the strings in
// slice order; it does not sort them.
func NewStringListIter(s []string) StringIter {
        return &stringListIter{l: s}
}

// stringListIter implements StringIter over an in-memory slice of strings.
type stringListIter struct {
        l   []string // Strings not yet returned.
        cur string   // String returned by At.
}

// Next advances to the next string and reports whether there was one.
func (s *stringListIter) Next() bool {
        if len(s.l) > 0 {
                s.cur, s.l = s.l[0], s.l[1:]
                return true
        }
        return false
}

// At returns the string the last successful call to Next advanced to.
func (s stringListIter) At() string { return s.cur }

// Err always returns nil; iterating a slice cannot fail.
func (s stringListIter) Err() error { return nil }

// Decoder provides decoding methods for the v1 and v2 index file format.
//
// It currently does not contain decoding methods for all entry types but can be extended
// by them if there's demand.
type Decoder struct {
        // LookupSymbol resolves a symbol reference found in a series entry to
        // its string. LabelValueFor and Series call it for label names and
        // label values.
        LookupSymbol func(context.Context, uint32) (string, error)
}

// Postings returns a postings list for b and its number of elements.
// It wraps b in a Decbuf and delegates to PostingsFromDecbuf.
func (dec *Decoder) Postings(b []byte) (int, Postings, error) {
        d := encoding.Decbuf{B: b}
        return dec.PostingsFromDecbuf(d)
}

// PostingsFromDecbuf returns a postings list for d and its number of elements.
// d has to hold a 4-byte big-endian count followed by exactly 4 bytes per
// posting; any other length is rejected.
func (dec *Decoder) PostingsFromDecbuf(d encoding.Decbuf) (int, Postings, error) {
        count := d.Be32int()
        payload := d.Get()
        if err := d.Err(); err != nil {
                return 0, nil, err
        }
        if want := 4 * count; len(payload) != want {
                return 0, nil, fmt.Errorf("unexpected postings length, should be %d bytes for %d postings, got %d bytes", want, count, len(payload))
        }
        return count, newBigEndianPostings(payload), nil
}

// LabelNamesOffsetsFor decodes the symbol references of the label names of the
// series entry in b.
// They are returned in the same order they're stored, which should be sorted lexicographically.
func (dec *Decoder) LabelNamesOffsetsFor(b []byte) ([]uint32, error) {
        d := encoding.Decbuf{B: b}

        // The entry starts with the number of label pairs.
        pairs := d.Uvarint()
        offsets := make([]uint32, pairs)
        for i := range offsets {
                offsets[i] = uint32(d.Uvarint())
                _ = d.Uvarint() // skip the label value

                if err := d.Err(); err != nil {
                        return nil, fmt.Errorf("read series label offsets: %w", err)
                }
        }

        return offsets, d.Err()
}

// LabelValueFor decodes the value of the given label from the series entry in
// b. Label names are resolved one by one via LookupSymbol until one equals
// label. If the series does not carry the label, the empty string is returned
// together with the decoder's error state.
func (dec *Decoder) LabelValueFor(ctx context.Context, b []byte, label string) (string, error) {
        d := encoding.Decbuf{B: b}

        for remaining := d.Uvarint(); remaining > 0; remaining-- {
                nameRef := uint32(d.Uvarint())
                valueRef := uint32(d.Uvarint())
                if err := d.Err(); err != nil {
                        return "", fmt.Errorf("read series label offsets: %w", err)
                }

                name, err := dec.LookupSymbol(ctx, nameRef)
                if err != nil {
                        return "", fmt.Errorf("lookup label name: %w", err)
                }
                if name != label {
                        continue
                }

                // Only the value of the wanted label is ever resolved.
                value, err := dec.LookupSymbol(ctx, valueRef)
                if err != nil {
                        return "", fmt.Errorf("lookup label value: %w", err)
                }
                return value, nil
        }

        return "", d.Err()
}

// Series decodes a series entry from the given byte slice into builder and chks.
// Previous contents of builder can be overwritten - make sure you copy before retaining.
// Skips reading chunks metadata if chks is nil.
// If chks is not nil it is truncated to length zero before decoding, so its
// backing array is reused. Symbol lookups are made with context.TODO().
func (dec *Decoder) Series(b []byte, builder *labels.ScratchBuilder, chks *[]chunks.Meta) error {
        builder.Reset()
        if chks != nil {
                *chks = (*chks)[:0]
        }

        d := encoding.Decbuf{B: b}

        // Number of label pairs in the entry.
        k := d.Uvarint()

        for i := 0; i < k; i++ {
                // Every label is stored as a pair of symbol references.
                lno := uint32(d.Uvarint())
                lvo := uint32(d.Uvarint())

                if d.Err() != nil {
                        return fmt.Errorf("read series label offsets: %w", d.Err())
                }

                ln, err := dec.LookupSymbol(context.TODO(), lno)
                if err != nil {
                        return fmt.Errorf("lookup label name: %w", err)
                }
                lv, err := dec.LookupSymbol(context.TODO(), lvo)
                if err != nil {
                        return fmt.Errorf("lookup label value: %w", err)
                }

                builder.Add(ln, lv)
        }

        // Skip reading chunks metadata if chks is nil.
        if chks == nil {
                return d.Err()
        }

        // Read the chunks meta data.
        // k is reused as the number of chunk metas from here on.
        k = d.Uvarint()

        if k == 0 {
                return d.Err()
        }

        // The first chunk carries its min time as a signed value, its max time
        // as an unsigned delta to the min time, and its reference as is.
        t0 := d.Varint64()
        maxt := int64(d.Uvarint64()) + t0
        ref0 := int64(d.Uvarint64())

        *chks = append(*chks, chunks.Meta{
                Ref:     chunks.ChunkRef(ref0),
                MinTime: t0,
                MaxTime: maxt,
        })
        // From now on t0 holds the max time of the previous chunk.
        t0 = maxt

        for i := 1; i < k; i++ {
                // Later chunks are delta encoded: min time relative to the
                // previous max time, max time relative to the own min time,
                // and the reference as a signed delta to the previous one.
                mint := int64(d.Uvarint64()) + t0
                maxt := int64(d.Uvarint64()) + mint

                ref0 += d.Varint64()
                t0 = maxt

                if d.Err() != nil {
                        return fmt.Errorf("read meta for chunk %d: %w", i, d.Err())
                }

                *chks = append(*chks, chunks.Meta{
                        Ref:     chunks.ChunkRef(ref0),
                        MinTime: mint,
                        MaxTime: maxt,
                })
        }
        return d.Err()
}

// yoloString reinterprets b as a string without copying its bytes.
// The result shares memory with b, so it is only valid as long as b is
// neither modified nor released.
func yoloString(b []byte) string {
        header := unsafe.Pointer(&b)
        return *(*string)(header)
}

// Copyright 2017 The Prometheus Authors
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package index

import (
        "container/heap"
        "context"
        "encoding/binary"
        "fmt"
        "math"
        "runtime"
        "slices"
        "sort"
        "strings"
        "sync"

        "github.com/bboreham/go-loser"

        "github.com/prometheus/prometheus/model/labels"
        "github.com/prometheus/prometheus/storage"
)

// allPostingsKey is the zero-value label, i.e. empty name and empty value.
var allPostingsKey = labels.Label{}

// AllPostingsKey returns the label key that is used to store the postings list of all existing IDs.
// Both returned strings are the fields of allPostingsKey.
func AllPostingsKey() (name, value string) {
        return allPostingsKey.Name, allPostingsKey.Value
}

// ensureOrderBatchSize is the max number of postings passed to a worker in a single batch in MemPostings.EnsureOrder().
const ensureOrderBatchSize = 1024

// ensureOrderBatchPool recycles the batches handed to the workers of
// MemPostings.EnsureOrder(). New batches are empty and have room for
// ensureOrderBatchSize postings lists.
var ensureOrderBatchPool = sync.Pool{
        New: func() any {
                batch := make([][]storage.SeriesRef, 0, ensureOrderBatchSize)
                // Return pointer type as preferred by Pool.
                return &batch
        },
}

// MemPostings holds postings list for series ID per label pair. They may be written
// to out of order.
// EnsureOrder() must be called once before any reads are done. This allows for quick
// unordered batch fills on startup.
type MemPostings struct {
        // mtx guards m; the read methods take it as a read lock.
        mtx     sync.RWMutex
        // m maps label name to label value to the series references of that pair.
        m       map[string]map[string][]storage.SeriesRef
        // ordered is true for instances created by NewMemPostings and false
        // for NewUnorderedMemPostings; EnsureOrder is a no-op when it is set.
        ordered bool
}

// NewMemPostings returns a memPostings that's ready for reads and writes.
// The instance is marked as ordered and its map is pre-sized for 512 label names.
func NewMemPostings() *MemPostings {
        return &MemPostings{
                m:       make(map[string]map[string][]storage.SeriesRef, 512),
                ordered: true,
        }
}

// NewUnorderedMemPostings returns a memPostings that is not safe to be read from
// until EnsureOrder() was called once.
// It differs from NewMemPostings only in that the ordered flag starts out false.
func NewUnorderedMemPostings() *MemPostings {
        return &MemPostings{
                m:       make(map[string]map[string][]storage.SeriesRef, 512),
                ordered: false,
        }
}

// Symbols returns an iterator over all unique label name and label value
// strings, in sorted order. The strings are collected under the read lock;
// sorting happens after the lock has been released.
func (p *MemPostings) Symbols() StringIter {
        // A set takes care of strings that occur more than once.
        unique := make(map[string]struct{}, 512)

        p.mtx.RLock()
        for name, byValue := range p.m {
                unique[name] = struct{}{}
                for value := range byValue {
                        unique[value] = struct{}{}
                }
        }
        p.mtx.RUnlock()

        sorted := make([]string, 0, len(unique))
        for sym := range unique {
                sorted = append(sorted, sym)
        }
        slices.Sort(sorted)

        return NewStringListIter(sorted)
}

// SortedKeys returns all label pairs of the postings, sorted by label name and
// then by label value.
func (p *MemPostings) SortedKeys() []labels.Label {
        p.mtx.RLock()
        keys := make([]labels.Label, 0, len(p.m))
        for name, byValue := range p.m {
                for value := range byValue {
                        keys = append(keys, labels.Label{Name: name, Value: value})
                }
        }
        p.mtx.RUnlock()

        byNameThenValue := func(a, b labels.Label) int {
                // The value only decides when the names are equal.
                if c := strings.Compare(a.Name, b.Name); c != 0 {
                        return c
                }
                return strings.Compare(a.Value, b.Value)
        }
        slices.SortFunc(keys, byNameThenValue)
        return keys
}

// LabelNames returns all the unique label names, in no particular order.
// The name of the all-postings key is left out. nil is returned when no
// postings are stored at all.
func (p *MemPostings) LabelNames() []string {
        p.mtx.RLock()
        defer p.mtx.RUnlock()

        if len(p.m) == 0 {
                return nil
        }

        // One slot less than there are names: the all-postings key is skipped.
        names := make([]string, 0, len(p.m)-1)
        for name := range p.m {
                if name == allPostingsKey.Name {
                        continue
                }
                names = append(names, name)
        }
        return names
}

// LabelValues returns the values stored for the given label name, in no
// particular order. An unknown name yields an empty, non-nil slice.
func (p *MemPostings) LabelValues(_ context.Context, name string) []string {
        p.mtx.RLock()
        defer p.mtx.RUnlock()

        byValue := p.m[name]
        values := make([]string, 0, len(byValue))
        for value := range byValue {
                values = append(values, value)
        }
        return values
}

// PostingsStats contains cardinality based statistics for postings.
// The field descriptions below reflect how MemPostings.Stats fills them.
type PostingsStats struct {
        // CardinalityMetricsStats holds, per value of the label passed to
        // Stats, the number of series carrying that value.
        CardinalityMetricsStats []Stat
        // CardinalityLabelStats holds, per label name, the number of distinct values.
        CardinalityLabelStats   []Stat
        // LabelValueStats holds, per label name, the sum over its values of
        // value length times number of series.
        LabelValueStats         []Stat
        // LabelValuePairsStats holds, per "name=value" pair, the number of series.
        LabelValuePairsStats    []Stat
        // NumLabelPairs is the total number of label pairs.
        NumLabelPairs           int
}

// Stats calculates the cardinality statistics from postings.
// label selects the label whose values are reported in
// CardinalityMetricsStats; limit is handed to every heap that collects a
// statistic. The all-postings entry (empty label name) is ignored.
func (p *MemPostings) Stats(label string, limit int) *PostingsStats {
        var (
                metricCardinality = &maxHeap{}
                labelCardinality  = &maxHeap{}
                valueBytes        = &maxHeap{}
                pairCardinality   = &maxHeap{}
                numLabelPairs     int
        )
        for _, h := range []*maxHeap{metricCardinality, labelCardinality, valueBytes, pairCardinality} {
                h.init(limit)
        }

        p.mtx.RLock()
        for name, byValue := range p.m {
                if name == "" {
                        continue
                }
                labelCardinality.push(Stat{Name: name, Count: uint64(len(byValue))})
                numLabelPairs += len(byValue)

                // Bytes taken by the values of this label across all series.
                var size uint64
                for value, refs := range byValue {
                        seriesCnt := uint64(len(refs))
                        if name == label {
                                metricCardinality.push(Stat{Name: value, Count: seriesCnt})
                        }
                        pairCardinality.push(Stat{Name: name + "=" + value, Count: seriesCnt})
                        size += uint64(len(value)) * seriesCnt
                }
                valueBytes.push(Stat{Name: name, Count: size})
        }
        p.mtx.RUnlock()

        return &PostingsStats{
                CardinalityMetricsStats: metricCardinality.get(),
                CardinalityLabelStats:   labelCardinality.get(),
                LabelValueStats:         valueBytes.get(),
                LabelValuePairsStats:    pairCardinality.get(),
                NumLabelPairs:           numLabelPairs,
        }
}

// Get returns a postings list for the given label pair.
// When no list is stored for the pair the EmptyPostings sentinel is returned.
func (p *MemPostings) Get(name, value string) Postings {
        p.mtx.RLock()
        // Indexing a missing (nil) inner map simply yields a nil slice.
        refs := p.m[name][value]
        p.mtx.RUnlock()

        if refs != nil {
                return newListPostings(refs...)
        }
        return EmptyPostings()
}

// All returns a postings list over all documents ever added.
// It is a lookup of the special label pair returned by AllPostingsKey.
func (p *MemPostings) All() Postings {
        name, value := AllPostingsKey()
        return p.Get(name, value)
}

// EnsureOrder ensures that all postings lists are sorted. After it returns all further
// calls to Add and addFor will insert new IDs in a sorted manner.
// Parameter numberOfConcurrentProcesses is used to specify the maximal number of
// CPU cores used for this operation. If it is <= 0, GOMAXPROCS is used.
// GOMAXPROCS was the default before introducing this parameter.
//
// The write lock is held for the whole call. Calling it again once the
// postings are marked as ordered is a no-op.
func (p *MemPostings) EnsureOrder(numberOfConcurrentProcesses int) {
        p.mtx.Lock()
        defer p.mtx.Unlock()

        if p.ordered {
                return
        }

        concurrency := numberOfConcurrentProcesses
        if concurrency <= 0 {
                concurrency = runtime.GOMAXPROCS(0)
        }
        // Unbuffered channel carrying batches of postings lists to the workers.
        workc := make(chan *[][]storage.SeriesRef)

        var wg sync.WaitGroup
        wg.Add(concurrency)

        for i := 0; i < concurrency; i++ {
                go func() {
                        for job := range workc {
                                // Sorting happens in place, so the slices stored in p.m end up ordered.
                                for _, l := range *job {
                                        slices.Sort(l)
                                }

                                // Reset the batch and hand it back to the pool for reuse.
                                *job = (*job)[:0]
                                ensureOrderBatchPool.Put(job)
                        }
                        wg.Done()
                }()
        }

        // Collect lists into batches of ensureOrderBatchSize and dispatch each full batch.
        nextJob := ensureOrderBatchPool.Get().(*[][]storage.SeriesRef)
        for _, e := range p.m {
                for _, l := range e {
                        *nextJob = append(*nextJob, l)

                        if len(*nextJob) >= ensureOrderBatchSize {
                                workc <- nextJob
                                nextJob = ensureOrderBatchPool.Get().(*[][]storage.SeriesRef)
                        }
                }
        }

        // If the last job was partially filled, we need to push it to workers too.
        if len(*nextJob) > 0 {
                workc <- nextJob
        }

        // Closing the channel ends the workers' range loops; wait until all have finished.
        close(workc)
        wg.Wait()

        p.ordered = true
}

// Delete removes all ids in the given map from the postings lists.
// affected contains all the labels that are affected by the deletion, there's
// no need to check other labels. The all-postings list is always processed too.
// Lists that become empty are dropped, and so is a label name left without values.
func (p *MemPostings) Delete(deleted map[storage.SeriesRef]struct{}, affected map[labels.Label]struct{}) {
        p.mtx.Lock()
        defer p.mtx.Unlock()

        // prune rebuilds the list of one label pair without the deleted ids.
        prune := func(lbl labels.Label) {
                byValue := p.m[lbl.Name]
                current := byValue[lbl.Value]

                kept := make([]storage.SeriesRef, 0, len(current))
                for _, ref := range current {
                        if _, gone := deleted[ref]; gone {
                                continue
                        }
                        kept = append(kept, ref)
                }

                if len(kept) > 0 {
                        byValue[lbl.Value] = kept
                        return
                }

                delete(byValue, lbl.Value)
                // Delete the key if we removed all values.
                if len(byValue) == 0 {
                        delete(p.m, lbl.Name)
                }
        }

        for lbl := range affected {
                prune(lbl)
        }
        prune(allPostingsKey)
}

// Iter calls f for each postings list. It aborts if f returns an error and returns it.
// The read lock is held for the entire iteration, including the calls to f.
func (p *MemPostings) Iter(f func(labels.Label, Postings) error) error {
        p.mtx.RLock()
        defer p.mtx.RUnlock()

        for name, byValue := range p.m {
                for value, refs := range byValue {
                        lbl := labels.Label{Name: name, Value: value}
                        if err := f(lbl, newListPostings(refs...)); err != nil {
                                return err
                        }
                }
        }
        return nil
}

// Add a label set to the postings index.
// The series id is appended to the list of every label in lset and finally to
// the all-postings list, all under the write lock.
func (p *MemPostings) Add(id storage.SeriesRef, lset labels.Labels) {
        p.mtx.Lock()

        register := func(lbl labels.Label) {
                p.addFor(id, lbl)
        }
        lset.Range(register)
        register(allPostingsKey)

        p.mtx.Unlock()
}

// addFor appends id to the postings list of label l, creating the per-name map
// on first use. It does no locking itself. Once the postings have been marked
// as ordered, the list is kept sorted on every insert.
func (p *MemPostings) addFor(id storage.SeriesRef, l labels.Label) {
        byValue, found := p.m[l.Name]
        if !found {
                byValue = map[string][]storage.SeriesRef{}
                p.m[l.Name] = byValue
        }
        refs := append(byValue[l.Value], id)
        byValue[l.Value] = refs

        if !p.ordered {
                return
        }
        // There is no guarantee that no higher ID was inserted before as they may
        // be generated independently before adding them to postings.
        // We repair order violations on insert. The invariant is that the first n-1
        // items in the list are already sorted, so the new id is bubbled towards
        // the front until its predecessor is not larger.
        for pos := len(refs) - 1; pos >= 1 && refs[pos] < refs[pos-1]; pos-- {
                refs[pos], refs[pos-1] = refs[pos-1], refs[pos]
        }
}

// PostingsForLabelMatching returns the merged postings of every value of label
// name for which match returns true.
// It returns EmptyPostings when no value matches, and ErrPostings wrapping
// ctx.Err() when the context is found cancelled during matching (the context is
// checked once every checkContextEveryNIterations iterations).
func (p *MemPostings) PostingsForLabelMatching(ctx context.Context, name string, match func(string) bool) Postings {
        // We'll copy the values into a slice and then match over that,
        // this way we don't need to hold the mutex while we're matching,
        // which can be slow (seconds) if the match function is a huge regex.
        // Holding this lock prevents new series from being added (slows down the write path)
        // and blocks the compaction process.
        vals := p.labelValues(name)
        // i only advances on a match; count counts every iteration and drives the context check.
        for i, count := 0, 1; i < len(vals); count++ {
                if count%checkContextEveryNIterations == 0 && ctx.Err() != nil {
                        return ErrPostings(ctx.Err())
                }

                if match(vals[i]) {
                        i++
                        continue
                }

                // Didn't match, bring the last value to this position, make the slice shorter and check again.
                // The order of the slice doesn't matter as it comes from a map iteration.
                vals[i], vals = vals[len(vals)-1], vals[:len(vals)-1]
        }

        // If none matched (or this label had no values), no need to grab the lock again.
        if len(vals) == 0 {
                return EmptyPostings()
        }

        // Now `vals` only contains the values that matched, get their postings.
        its := make([]Postings, 0, len(vals))
        p.mtx.RLock()
        e := p.m[name]
        for _, v := range vals {
                if refs, ok := e[v]; ok {
                        // Some of the values may have been garbage-collected in the meantime this is fine, we'll just skip them.
                        // If we didn't let the mutex go, we'd have these postings here, but they would be pointing nowhere
                        // because there would be a `MemPostings.Delete()` call waiting for the lock to delete these labels,
                        // because the series were deleted already.
                        its = append(its, NewListPostings(refs))
                }
        }
        // Let the mutex go before merging.
        p.mtx.RUnlock()

        return Merge(ctx, its...)
}

// labelValues returns a slice of label values for the given label name.
// It will take the read lock. Values whose postings list is empty are left
// out, and nil is returned when the label name has no values at all.
func (p *MemPostings) labelValues(name string) []string {
        p.mtx.RLock()
        defer p.mtx.RUnlock()

        byValue := p.m[name]
        if len(byValue) == 0 {
                return nil
        }

        out := make([]string, 0, len(byValue))
        for value, refs := range byValue {
                if len(refs) == 0 {
                        continue
                }
                out = append(out, value)
        }
        return out
}

// ExpandPostings returns the postings expanded as a slice.
// The iterator is drained with Next/At; the error reported by p.Err() after
// draining is returned together with whatever values were collected.
func ExpandPostings(p Postings) (res []storage.SeriesRef, err error) {
        for p.Next() {
                res = append(res, p.At())
        }
        err = p.Err()
        return res, err
}

// Postings provides iterative access over a postings list.
// A list is consumed by calling Next (or Seek) until it returns false and then
// checking Err.
type Postings interface {
        // Next advances the iterator and returns true if another value was found.
        Next() bool

        // Seek advances the iterator to value v or greater and returns
        // true if a value was found.
        Seek(v storage.SeriesRef) bool

        // At returns the value at the current iterator position.
        // At should only be called after a successful call to Next or Seek.
        At() storage.SeriesRef

        // Err returns the last error of the iterator.
        Err() error
}

// errPostings is an empty iterator: Next and Seek always return false and At
// returns 0. Err reports the stored error, which is nil for the emptyPostings
// sentinel and non-nil for values built by ErrPostings with a non-nil error.
type errPostings struct {
        err error
}

func (e errPostings) Next() bool                  { return false }
func (e errPostings) Seek(storage.SeriesRef) bool { return false }
func (e errPostings) At() storage.SeriesRef       { return 0 }
func (e errPostings) Err() error                  { return e.err }

// emptyPostings is the sentinel returned by EmptyPostings; it carries no error.
var emptyPostings = errPostings{}

// EmptyPostings returns a postings list that's always empty.
// NOTE: Returning EmptyPostings sentinel when Postings struct has no postings is recommended.
// It triggers optimized flow in other functions like Intersect, Without etc.
// Every call returns the same sentinel value, so results can be compared with ==.
func EmptyPostings() Postings {
        return emptyPostings
}

// IsEmptyPostingsType returns true if the postings are the EmptyPostings sentinel.
// When this function returns false, it doesn't mean that the postings isn't empty
// (it could be an empty intersection of two non-empty postings, for example).
func IsEmptyPostingsType(p Postings) bool {
        return p == emptyPostings
}

// ErrPostings returns new postings that immediately error.
// The result yields no values and reports err from its Err method.
func ErrPostings(err error) Postings {
        failing := errPostings{err: err}
        return failing
}

// Intersect returns a new postings list over the intersection of the
// input postings.
// No input, or any input that is the EmptyPostings sentinel, gives
// EmptyPostings; a single input is returned unchanged.
func Intersect(its ...Postings) Postings {
        switch len(its) {
        case 0:
                return EmptyPostings()
        case 1:
                return its[0]
        }

        empty := EmptyPostings()
        for _, candidate := range its {
                if candidate == empty {
                        return empty
                }
        }

        return newIntersectPostings(its...)
}

// intersectPostings iterates over the values present in every postings list of arr.
// cur is the value currently pointed at (also the candidate while advancing).
type intersectPostings struct {
        arr []Postings
        cur storage.SeriesRef
}

// newIntersectPostings wraps the given postings lists without advancing them.
func newIntersectPostings(its ...Postings) *intersectPostings {
        return &intersectPostings{arr: its}
}

// At returns the current value of the intersection.
func (it *intersectPostings) At() storage.SeriesRef {
        return it.cur
}

// doNext seeks every underlying list to it.cur. Whenever a list lands on a
// larger value, that value becomes the new candidate and the scan restarts
// from the first list. It returns true once all lists sit on it.cur, and false
// as soon as any list cannot reach the candidate.
func (it *intersectPostings) doNext() bool {
Loop:
        for {
                for _, p := range it.arr {
                        if !p.Seek(it.cur) {
                                return false
                        }
                        if p.At() > it.cur {
                                it.cur = p.At()
                                continue Loop
                        }
                }
                return true
        }
}

// Next advances every underlying list by one, raises the candidate to the
// largest value seen and then lets doNext align all lists on a common value.
// It returns false as soon as any underlying list is exhausted.
func (it *intersectPostings) Next() bool {
        for _, sub := range it.arr {
                if !sub.Next() {
                        return false
                }
                if at := sub.At(); at > it.cur {
                        it.cur = at
                }
        }
        return it.doNext()
}

// Seek sets the candidate to id and lets doNext align all underlying lists on
// the first common value that is >= id. It returns false if there is none.
func (it *intersectPostings) Seek(id storage.SeriesRef) bool {
        it.cur = id
        return it.doNext()
}

// Err returns the first non-nil error reported by any of the underlying
// postings lists, or nil if none of them failed.
func (it *intersectPostings) Err() error {
        for _, p := range it.arr {
                // Query each iterator once, in line with mergedPostings.Err.
                if err := p.Err(); err != nil {
                        return err
                }
        }
        return nil
}

// Merge returns a new iterator over the union of the input iterators.
// No input gives EmptyPostings and a single input is returned as is.
// The context argument is currently unused.
func Merge(_ context.Context, its ...Postings) Postings {
        switch len(its) {
        case 0:
                return EmptyPostings()
        case 1:
                return its[0]
        }

        if merged, nonEmpty := newMergedPostings(its); nonEmpty {
                return merged
        }
        return EmptyPostings()
}

// mergedPostings iterates over the union of several postings lists.
// p holds the input lists (used for error reporting), h is the loser tree that
// yields their values, and cur is the value last returned.
type mergedPostings struct {
        p   []Postings
        h   *loser.Tree[storage.SeriesRef, Postings]
        cur storage.SeriesRef
}

// newMergedPostings builds a mergedPostings over p.
// nonEmpty is always true in this implementation.
func newMergedPostings(p []Postings) (m *mergedPostings, nonEmpty bool) {
        const maxVal = storage.SeriesRef(math.MaxUint64) // This value must be higher than all real values used in the tree.
        lt := loser.New(p, maxVal)
        return &mergedPostings{p: p, h: lt}, true
}

// Next advances to the next value of the union that differs from the current
// one, so that duplicates across the input lists are reported only once.
// It returns false when the tree is exhausted.
func (it *mergedPostings) Next() bool {
        for it.h.Next() {
                // Remove duplicate entries.
                if candidate := it.h.At(); candidate != it.cur {
                        it.cur = candidate
                        return true
                }
        }
        return false
}

// Seek advances the union to the first value >= id.
// While the tree's current value is below id, the winning list is sought to id
// and the tree is fixed up, passing along whether that list is finished.
// It returns false when the tree becomes empty.
func (it *mergedPostings) Seek(id storage.SeriesRef) bool {
        for !it.h.IsEmpty() && it.h.At() < id {
                finished := !it.h.Winner().Seek(id)
                it.h.Fix(finished)
        }
        if it.h.IsEmpty() {
                return false
        }
        it.cur = it.h.At()
        return true
}

// At returns the value last selected by Next or Seek.
func (it mergedPostings) At() storage.SeriesRef {
        return it.cur
}

// Err returns the first non-nil error among the input postings lists, or nil.
func (it mergedPostings) Err() error {
        for idx := range it.p {
                if err := it.p[idx].Err(); err != nil {
                        return err
                }
        }
        return nil
}

// Without returns a new postings list that contains all elements from the full list that
// are not in the drop list.
// The EmptyPostings sentinel short-circuits: an empty full list gives
// EmptyPostings, and an empty drop list gives full unchanged.
func Without(full, drop Postings) Postings {
        empty := EmptyPostings()
        switch {
        case full == empty:
                return empty
        case drop == empty:
                return full
        default:
                return newRemovedPostings(full, drop)
        }
}

// removedPostings iterates over the values of full that are not in remove.
// cur is the value last returned. initialized records whether both lists have
// been advanced at least once; fok and rok hold the result of the latest
// Next/Seek call on full and remove respectively.
type removedPostings struct {
        full, remove Postings

        cur storage.SeriesRef

        initialized bool
        fok, rok    bool
}

// newRemovedPostings wraps the two lists without advancing them.
func newRemovedPostings(full, remove Postings) *removedPostings {
        return &removedPostings{
                full:   full,
                remove: remove,
        }
}

// At returns the value last selected by Next or Seek.
func (rp *removedPostings) At() storage.SeriesRef {
        return rp.cur
}

// Next advances to the next value of full that is not present in remove.
// The full list always stays one step ahead: the value handed out is stored in
// cur and full is advanced before returning.
func (rp *removedPostings) Next() bool {
        // Lazily position both lists on their first element.
        if !rp.initialized {
                rp.fok = rp.full.Next()
                rp.rok = rp.remove.Next()
                rp.initialized = true
        }
        for {
                if !rp.fok {
                        return false
                }

                // Nothing left to remove: everything remaining in full is returned.
                if !rp.rok {
                        rp.cur = rp.full.At()
                        rp.fok = rp.full.Next()
                        return true
                }
                switch fcur, rcur := rp.full.At(), rp.remove.At(); {
                case fcur < rcur:
                        // fcur is below the next removed value, so it is kept.
                        rp.cur = fcur
                        rp.fok = rp.full.Next()

                        return true
                case rcur < fcur:
                        // Forward the remove postings to the right position.
                        rp.rok = rp.remove.Seek(fcur)
                default:
                        // Skip the current posting.
                        rp.fok = rp.full.Next()
                }
        }
}

// Seek advances to the first value >= id that is in full but not in remove.
// If the current value already satisfies id it returns true without moving.
// Otherwise both lists are sought to id and Next picks the first value to keep.
func (rp *removedPostings) Seek(id storage.SeriesRef) bool {
        if rp.cur >= id {
                return true
        }

        rp.fok = rp.full.Seek(id)
        rp.rok = rp.remove.Seek(id)
        // Both lists are positioned now, so Next must not advance them again first.
        rp.initialized = true

        return rp.Next()
}

// Err returns the error of the full list if it has one, otherwise the error of
// the remove list (which may be nil).
func (rp *removedPostings) Err() error {
        // Query the full list once, in line with mergedPostings.Err.
        if err := rp.full.Err(); err != nil {
                return err
        }

        return rp.remove.Err()
}

// ListPostings implements the Postings interface over a plain list.
// list holds the values not yet consumed and cur the value last returned.
type ListPostings struct {
        list []storage.SeriesRef
        cur  storage.SeriesRef
}

// NewListPostings returns a Postings iterating over list.
// The slice is not copied; the iterator reads from it directly.
func NewListPostings(list []storage.SeriesRef) Postings {
        return newListPostings(list...)
}

// newListPostings is the concrete-typed constructor behind NewListPostings.
func newListPostings(list ...storage.SeriesRef) *ListPostings {
        return &ListPostings{list: list}
}

// At returns the value last selected by Next or Seek.
func (it *ListPostings) At() storage.SeriesRef {
        return it.cur
}

// Next moves to the next value of the list and returns true if there is one.
// Once the list is exhausted the current value is reset to 0.
func (it *ListPostings) Next() bool {
        if len(it.list) == 0 {
                it.cur = 0
                return false
        }
        it.cur, it.list = it.list[0], it.list[1:]
        return true
}

// Seek advances to the first value >= x and returns true if one exists.
// It returns true without moving when the current value already satisfies x.
// When no remaining value is large enough the remaining list is dropped.
func (it *ListPostings) Seek(x storage.SeriesRef) bool {
        switch {
        case it.cur >= x:
                // If the current value satisfies, then return.
                return true
        case len(it.list) == 0:
                return false
        }

        // Do binary search between current position and end.
        remaining := len(it.list)
        pos := sort.Search(remaining, func(k int) bool {
                return it.list[k] >= x
        })
        if pos >= remaining {
                it.list = nil
                return false
        }
        it.cur = it.list[pos]
        it.list = it.list[pos+1:]
        return true
}

// Err always returns nil: this implementation never fails.
func (it *ListPostings) Err() error {
        return nil
}

// bigEndianPostings implements the Postings interface over a byte stream of
// big endian numbers.
// Each value occupies 4 bytes; list holds the bytes not yet consumed and cur
// the value last decoded.
type bigEndianPostings struct {
        list []byte
        cur  uint32
}

// newBigEndianPostings returns an iterator over list without copying it.
func newBigEndianPostings(list []byte) *bigEndianPostings {
        return &bigEndianPostings{list: list}
}

// At returns the value last decoded, widened to a storage.SeriesRef.
func (it *bigEndianPostings) At() storage.SeriesRef {
        return storage.SeriesRef(it.cur)
}

// Next decodes the next 4-byte big endian value and returns true, or returns
// false when fewer than 4 bytes remain.
func (it *bigEndianPostings) Next() bool {
        if len(it.list) < 4 {
                return false
        }
        it.cur = binary.BigEndian.Uint32(it.list)
        it.list = it.list[4:]
        return true
}

// Seek advances to the first value >= x and returns true if one exists.
// It returns true without moving when the current value already satisfies x.
// When no remaining value is large enough the remaining bytes are dropped.
func (it *bigEndianPostings) Seek(x storage.SeriesRef) bool {
        if storage.SeriesRef(it.cur) >= x {
                return true
        }

        num := len(it.list) / 4
        // Do binary search between current position and end.
        // Stored values are widened to SeriesRef for the comparison. Narrowing x
        // to uint32 instead would wrap for targets above the 32-bit range and
        // could report a match on a value smaller than x.
        i := sort.Search(num, func(i int) bool {
                return storage.SeriesRef(binary.BigEndian.Uint32(it.list[i*4:])) >= x
        })
        if i < num {
                j := i * 4
                it.cur = binary.BigEndian.Uint32(it.list[j:])
                it.list = it.list[j+4:]
                return true
        }
        it.list = nil
        return false
}

// Err always returns nil: this implementation never fails.
func (it *bigEndianPostings) Err() error {
        return nil
}

// FindIntersectingPostings checks the intersection of p and candidates[i] for each i in candidates,
// if intersection is non empty, then i is added to the indexes returned.
// Returned indexes are not sorted.
// An error from a candidate gives (nil, err). When p runs out first, the
// indexes found so far are returned together with p.Err().
func FindIntersectingPostings(p Postings, candidates []Postings) (indexes []int, err error) {
        // Only candidates that have a first value take part in the heap.
        h := make(postingsWithIndexHeap, 0, len(candidates))
        for idx, it := range candidates {
                if it.Next() {
                        h = append(h, postingsWithIndex{index: idx, p: it})
                        continue
                }
                if it.Err() != nil {
                        return nil, it.Err()
                }
        }
        if h.empty() {
                return nil, nil
        }
        heap.Init(&h)

        for !h.empty() {
                if !p.Seek(h.at()) {
                        return indexes, p.Err()
                }
                if p.At() != h.at() {
                        // The smallest candidate value is not in p: move that candidate on.
                        if err := h.next(); err != nil {
                                return nil, err
                        }
                        continue
                }
                // Match found: record the candidate and stop considering it.
                indexes = append(indexes, h.popIndex())
        }

        return indexes, nil
}

// postingsWithIndex is used as postingsWithIndexHeap elements by FindIntersectingPostings,
// keeping track of the original index of each postings while they move inside the heap.
type postingsWithIndex struct {
        index int
        p     Postings
        // popped means that these postings shouldn't be considered anymore.
        // See popIndex() comment to understand why we need this.
        popped bool
}

// postingsWithIndexHeap implements heap.Interface,
// with root always pointing to the postings with minimum Postings.At() value.
// It also implements a special way of removing elements that marks them as popped and moves them to the bottom of the
// heap instead of actually removing them, see popIndex() for more details.
type postingsWithIndexHeap []postingsWithIndex

// empty checks whether the heap is empty, which is true if it has no elements, or if the smallest element is popped.
// Popped elements sort after all others, so a popped root means every element is popped.
func (h *postingsWithIndexHeap) empty() bool {
        return len(*h) == 0 || (*h)[0].popped
}

// popIndex pops the smallest heap element and returns its index.
// Rather than calling heap.Pop(), the root is flagged as `popped` and its
// position is fixed, which moves it behind all the non-popped elements
// according to our sorting strategy.
// Skipping heap.Pop() avoids the extra allocation caused by this heap's Pop()
// implementation returning an interface{}.
func (h *postingsWithIndexHeap) popIndex() int {
        root := &(*h)[0]
        idx := root.index
        root.popped = true
        heap.Fix(h, 0)
        return idx
}

// at provides the storage.SeriesRef where root Postings is pointing at this moment.
// It indexes the root directly, so it must not be called on a heap with no elements.
func (h postingsWithIndexHeap) at() storage.SeriesRef { return h[0].p.At() }

// next performs the Postings.Next() operation on the root of the heap and keeps the heap consistent.
// If Next() succeeds, the heap is fixed to move the root to its new position, according to its Postings.At() value.
// If Next() fails and Postings.Err() reports an error, that error is returned wrapped with the postings index.
// If Next() fails without an error, the root is marked as popped and the heap is fixed.
func (h *postingsWithIndexHeap) next() error {
        root := (*h)[0]
        if root.p.Next() {
                heap.Fix(h, 0)
                return nil
        }

        if err := root.p.Err(); err != nil {
                return fmt.Errorf("postings %d: %w", root.index, err)
        }
        h.popIndex()
        return nil
}

// Len implements heap.Interface.
// Notice that Len() > 0 does not imply that heap is not empty as elements are not removed from this heap.
// Use empty() to check whether heap is empty or not.
func (h postingsWithIndexHeap) Len() int { return len(h) }

// Less implements heap.Interface, it puts all the popped elements at the bottom,
// and then sorts by Postings.At() property of each node.
func (h postingsWithIndexHeap) Less(i, j int) bool {
        left, right := h[i], h[j]
        if left.popped == right.popped {
                return left.p.At() < right.p.At()
        }
        // Exactly one of them is popped: the live one sorts first.
        return right.popped
}

// Swap implements heap.Interface.
func (h *postingsWithIndexHeap) Swap(i, j int) { (*h)[i], (*h)[j] = (*h)[j], (*h)[i] }

// Push implements heap.Interface.
// x must be a postingsWithIndex; any other type makes the type assertion panic.
func (h *postingsWithIndexHeap) Push(x interface{}) {
        *h = append(*h, x.(postingsWithIndex))
}

// Pop implements heap.Interface and pops the last element, which is NOT the min element,
// so this doesn't return the same as heap.Pop().
// Although this method is implemented for correctness, we don't expect it to be used, see popIndex() method for details.
func (h *postingsWithIndexHeap) Pop() interface{} {
        last := len(*h) - 1
        x := (*h)[last]
        *h = (*h)[:last]
        return x
}

// Copyright 2019 The Prometheus Authors
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package index

import (
        "math"
        "slices"
)

// Stat holds values for a single cardinality statistic.
type Stat struct {
        Name  string
        Count uint64
}

// maxHeap keeps up to maxLength items with the highest Count seen so far.
// Despite its name it is a flat slice that tracks the value and position of
// its current minimum, so the smallest kept item can be replaced cheaply.
type maxHeap struct {
        maxLength int
        minValue  uint64
        minIndex  int
        Items     []Stat
}

// init prepares the heap to keep at most length items.
// A non-positive length yields a heap that keeps nothing.
func (m *maxHeap) init(length int) {
        // make panics on a negative capacity, so clamp it.
        if length < 0 {
                length = 0
        }
        m.maxLength = length
        m.minValue = math.MaxUint64
        m.Items = make([]Stat, 0, length)
}

// push offers item to the heap. While the heap is not full the item is simply
// appended. Afterwards an item is kept only if its Count is not below the
// current minimum, in which case it replaces the minimum item.
func (m *maxHeap) push(item Stat) {
        // A heap without capacity (limit <= 0 or never initialised) keeps
        // nothing; without this guard the replacement below would index into
        // an empty slice.
        if m.maxLength <= 0 {
                return
        }
        if len(m.Items) < m.maxLength {
                if item.Count < m.minValue {
                        m.minValue = item.Count
                        m.minIndex = len(m.Items)
                }
                m.Items = append(m.Items, item)
                return
        }
        if item.Count < m.minValue {
                return
        }

        // Replace the current minimum, then rescan to find the new minimum.
        m.Items[m.minIndex] = item
        m.minValue = item.Count

        for i, stat := range m.Items {
                if stat.Count < m.minValue {
                        m.minValue = stat.Count
                        m.minIndex = i
                }
        }
}

// get sorts the kept items in place by descending Count and returns them.
// The returned slice is the heap's own storage.
func (m *maxHeap) get() []Stat {
        slices.SortFunc(m.Items, func(a, b Stat) int {
                switch {
                case b.Count < a.Count:
                        return -1
                case b.Count > a.Count:
                        return 1
                default:
                        return 0
                }
        })
        return m.Items
}

// Copyright 2020 The Prometheus Authors
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package tsdb

import (
        "math"
        "sync"
)

// isolationState holds the isolation information.
// Instances are created by isolation.State, which links them into the
// isolation's list of open reads, and are unlinked again by Close.
type isolationState struct {
        // We will ignore all appends above the max, or that are incomplete.
        maxAppendID       uint64
        incompleteAppends map[uint64]struct{}
        lowWatermark      uint64 // Lowest of incompleteAppends/maxAppendID.
        isolation         *isolation
        mint, maxt        int64 // Time ranges of the read.

        // Doubly linked list of active reads.
        next *isolationState
        prev *isolationState
}

// Close closes the state by unlinking it from the list of open reads,
// under the owning isolation's read mutex.
func (i *isolationState) Close() {
        mtx := &i.isolation.readMtx
        mtx.Lock()
        defer mtx.Unlock()

        before, after := i.prev, i.next
        after.prev = before
        before.next = after
}

// IsolationDisabled reports whether the owning isolation was created with
// isolation disabled.
func (i *isolationState) IsolationDisabled() bool {
        return i.isolation.disabled
}

// isolationAppender is one node of the doubly linked list of open appends.
// It records the append's ID and the minTime passed to newAppendID.
type isolationAppender struct {
        appendID uint64
        minTime  int64
        prev     *isolationAppender
        next     *isolationAppender
}

// isolation is the global isolation state.
// It tracks the appends in progress and the reads in progress, each in a
// circular doubly linked list anchored by a sentinel head element.
type isolation struct {
        // Mutex for accessing lastAppendID and appendsOpen.
        appendMtx sync.RWMutex
        // Which appends are currently in progress.
        appendsOpen map[uint64]*isolationAppender
        // New appenders with higher appendID are added to the end. First element keeps lastAppendId.
        // appendsOpenList.next points to the first element and appendsOpenList.prev points to the last element.
        // If there are no appenders, both point back to appendsOpenList.
        appendsOpenList *isolationAppender
        // Pool of reusable *isolationAppender to save on allocations.
        appendersPool sync.Pool

        // Mutex for accessing readsOpen.
        // If taking both appendMtx and readMtx, take appendMtx first.
        readMtx sync.RWMutex
        // All current in use isolationStates. This is a doubly-linked list.
        readsOpen *isolationState
        // If true, writes are not tracked while reads are still tracked.
        disabled bool
}

// newIsolation returns an isolation with no open appends or reads.
// Both linked lists start as a single sentinel head pointing at itself.
func newIsolation(disabled bool) *isolation {
        readsHead := &isolationState{}
        readsHead.next, readsHead.prev = readsHead, readsHead

        appendsHead := &isolationAppender{}
        appendsHead.next, appendsHead.prev = appendsHead, appendsHead

        iso := &isolation{
                appendsOpen:     map[uint64]*isolationAppender{},
                appendsOpenList: appendsHead,
                readsOpen:       readsHead,
                disabled:        disabled,
        }
        iso.appendersPool = sync.Pool{New: func() interface{} { return &isolationAppender{} }}
        return iso
}

// lowWatermark returns the appendID below which we no longer need to track
// which appends were from which appendID.
// It returns 0 when isolation is disabled. Otherwise it takes the append mutex
// for reading and delegates to lowWatermarkLocked.
func (i *isolation) lowWatermark() uint64 {
        if i.disabled {
                return 0
        }

        i.appendMtx.RLock() // Take appendMtx first.
        defer i.appendMtx.RUnlock()
        return i.lowWatermarkLocked()
}

// lowWatermarkLocked is lowWatermark for callers that already hold appendMtx.
// It returns 0 when isolation is disabled. With reads open it returns the
// lowWatermark of the read at the tail of the reads list, otherwise the
// appendID of the first open appender (or of the list head if none is open).
func (i *isolation) lowWatermarkLocked() uint64 {
        if i.disabled {
                return 0
        }

        i.readMtx.RLock()
        defer i.readMtx.RUnlock()
        if tail := i.readsOpen.prev; tail != i.readsOpen {
                return tail.lowWatermark
        }

        // Lowest appendID from appenders, or lastAppendId.
        first := i.appendsOpenList.next
        return first.appendID
}

// lowestAppendTime returns the lowest minTime for any open appender,
// or math.MaxInt64 if no open appenders.
func (i *isolation) lowestAppendTime() int64 {
        i.appendMtx.RLock()
        defer i.appendMtx.RUnlock()

        lowest := int64(math.MaxInt64)
        head := i.appendsOpenList
        // Walk the circular list until we are back at the sentinel head.
        for app := head.next; app != head; app = app.next {
                if app.minTime < lowest {
                        lowest = app.minTime
                }
        }
        return lowest
}

// State returns an object used to control isolation
// between a query and appends. Must be closed when complete.
// The state snapshots the last append ID, the lowest open append ID and the
// set of open appends, and is linked right after the head of the reads list.
func (i *isolation) State(mint, maxt int64) *isolationState {
        i.appendMtx.RLock() // Take append mutex before read mutex.
        defer i.appendMtx.RUnlock()

        // Snapshot of the appends that are still in progress.
        incomplete := make(map[uint64]struct{}, len(i.appendsOpen))
        for id := range i.appendsOpen {
                incomplete[id] = struct{}{}
        }

        // We need to track reads even when isolation is disabled, so that head
        // truncation can wait till reads overlapping that range have finished.
        st := &isolationState{
                maxAppendID:       i.appendsOpenList.appendID,
                lowWatermark:      i.appendsOpenList.next.appendID, // Lowest appendID from appenders, or lastAppendId.
                incompleteAppends: incomplete,
                isolation:         i,
                mint:              mint,
                maxt:              maxt,
        }

        i.readMtx.Lock()
        defer i.readMtx.Unlock()
        head := i.readsOpen
        st.prev = head
        st.next = head.next
        head.next.prev = st
        head.next = st

        return st
}

// TraverseOpenReads iterates through the open reads and runs the given
// function on those states. The given function MUST NOT mutate the isolationState.
// The iteration is stopped when the function returns false or once all reads have been iterated.
// The read mutex is held for reading during the whole traversal.
func (i *isolation) TraverseOpenReads(f func(s *isolationState) bool) {
        i.readMtx.RLock()
        defer i.readMtx.RUnlock()

        for s := i.readsOpen.next; s != i.readsOpen; s = s.next {
                if !f(s) {
                        return
                }
        }
}

// newAppendID increments the transaction counter and returns a new transaction
// ID. The first ID returned is 1.
// Also returns the low watermark, to keep lock/unlock operations down.
// When isolation is disabled nothing is tracked and both results are 0.
func (i *isolation) newAppendID(minTime int64) (uint64, uint64) {
        if i.disabled {
                return 0, 0
        }

        i.appendMtx.Lock()
        defer i.appendMtx.Unlock()

        // Last used appendID is stored in head element.
        head := i.appendsOpenList
        head.appendID++

        app := i.appendersPool.Get().(*isolationAppender)
        app.appendID = head.appendID
        app.minTime = minTime

        // Link the new appender in at the tail of the circular list.
        tail := head.prev
        app.prev, app.next = tail, head
        tail.next = app
        head.prev = app

        i.appendsOpen[app.appendID] = app
        return app.appendID, i.lowWatermarkLocked()
}

// lastAppendID returns the most recently issued append ID, which is stored in
// the head element of the open-appends list. It returns 0 when isolation is
// disabled.
func (i *isolation) lastAppendID() uint64 {
        if i.disabled {
                return 0
        }

        i.appendMtx.RLock()
        defer i.appendMtx.RUnlock()

        last := i.appendsOpenList.appendID
        return last
}

// closeAppend removes the appender registered under appendID from the set of
// open appends and hands it back to the pool. Unknown IDs are ignored, and the
// call is a no-op when isolation is disabled.
func (i *isolation) closeAppend(appendID uint64) {
        if i.disabled {
                return
        }

        i.appendMtx.Lock()
        defer i.appendMtx.Unlock()

        app := i.appendsOpen[appendID]
        if app == nil {
                return
        }

        // Unlink from the doubly linked list, then drop the map entry.
        prev, next := app.prev, app.next
        prev.next = next
        next.prev = prev
        delete(i.appendsOpen, appendID)

        // Reset every field before returning the appender to the pool.
        *app = isolationAppender{}
        i.appendersPool.Put(app)
}

// txRing is a ring buffer of transaction (append) IDs that grows on demand.
type txRing struct {
        txIDs     []uint64
        txIDFirst uint32 // Position of the first id in the ring.
        txIDCount uint32 // How many ids in the ring.
}

// newTxRing returns an empty ring with room for capacity IDs.
func newTxRing(capacity int) *txRing {
        ring := new(txRing)
        ring.txIDs = make([]uint64, capacity)
        return ring
}

// add appends appendID after the last stored ID, doubling the backing slice
// first when the ring is full (a zero-capacity ring grows to 4 slots).
func (txr *txRing) add(appendID uint64) {
        if len(txr.txIDs) == int(txr.txIDCount) {
                grown := txr.txIDCount * 2
                if grown == 0 {
                        grown = 4
                }
                // Unroll the ring into the new slice so the oldest ID sits at index 0.
                bigger := make([]uint64, grown)
                copied := copy(bigger, txr.txIDs[txr.txIDFirst:])
                copy(bigger[copied:], txr.txIDs[:txr.txIDFirst])
                txr.txIDs, txr.txIDFirst = bigger, 0
        }

        slot := int(txr.txIDFirst+txr.txIDCount) % len(txr.txIDs)
        txr.txIDs[slot] = appendID
        txr.txIDCount++
}

// cleanupAppendIDsBelow drops IDs from the front of the ring for as long as
// they are strictly smaller than bound.
func (txr *txRing) cleanupAppendIDsBelow(bound uint64) {
        size := len(txr.txIDs)
        if size == 0 {
                return
        }

        for cursor := int(txr.txIDFirst); txr.txIDCount > 0 && txr.txIDs[cursor] < bound; {
                txr.txIDFirst++
                txr.txIDCount--

                cursor++
                if cursor == size {
                        cursor = 0
                }
        }

        // txIDFirst may have run past the end of the slice; wrap it back.
        txr.txIDFirst %= uint32(size)
}

// iterator returns an iterator positioned at the first ID of the ring.
func (txr *txRing) iterator() *txRingIterator {
        it := new(txRingIterator)
        it.ids = txr.txIDs
        it.pos = txr.txIDFirst
        return it
}

// txRingIterator lets you iterate over the ring. It never terminates on its
// own: Next wraps around forever, so the caller has to decide when to stop.
type txRingIterator struct {
        ids []uint64

        pos uint32
}

// At returns the ID at the current position.
func (it *txRingIterator) At() uint64 {
        return it.ids[it.pos]
}

// Next advances by one slot, wrapping to index 0 past the end of the slice.
func (it *txRingIterator) Next() {
        if next := it.pos + 1; int(next) == len(it.ids) {
                it.pos = 0
        } else {
                it.pos = next
        }
}

// Copyright 2022 The Prometheus Authors
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package tsdb

import (
        "fmt"
        "sort"

        "github.com/oklog/ulid"

        "github.com/prometheus/prometheus/tsdb/chunkenc"
        "github.com/prometheus/prometheus/tsdb/chunks"
        "github.com/prometheus/prometheus/tsdb/tombstones"
)

// OOOChunk maintains samples in time-ascending order.
// Inserts for timestamps that already exist are dropped.
// Samples are stored uncompressed to allow easy sorting.
// Perhaps we can be more efficient later.
type OOOChunk struct {
        // samples is kept sorted by ascending timestamp by Insert.
        samples []sample
}

// NewOOOChunk returns an empty OOOChunk with room pre-allocated for 4 samples.
func NewOOOChunk() *OOOChunk {
        initial := make([]sample, 0, 4)
        return &OOOChunk{samples: initial}
}

// Insert inserts the sample such that ascending timestamp order is maintained.
// It returns true if the sample was stored, and false if a sample with the
// same timestamp already exists (the existing sample is left untouched).
func (o *OOOChunk) Insert(t int64, v float64) bool {
        // Although out-of-order samples can be out-of-order amongst themselves, we
        // are opinionated and expect them to be usually in-order meaning we could
        // try to append at the end first if the new timestamp is higher than the
        // last known timestamp.
        n := len(o.samples)
        if n == 0 || t > o.samples[n-1].t {
                o.samples = append(o.samples, sample{t, v, nil, nil})
                return true
        }

        // Find the index of the first sample whose timestamp is >= t. The guard
        // above guarantees that the last sample satisfies this, so the search
        // always returns a valid index (never n).
        i := sort.Search(n, func(i int) bool { return o.samples[i].t >= t })

        // Duplicate sample for timestamp is not allowed.
        if o.samples[i].t == t {
                return false
        }

        // Expand length by 1 to make room. Use a zero sample, we will overwrite it anyway.
        o.samples = append(o.samples, sample{})
        copy(o.samples[i+1:], o.samples[i:])
        o.samples[i] = sample{t, v, nil, nil}

        return true
}

// NumSamples returns the number of samples currently held in the chunk.
func (o *OOOChunk) NumSamples() int {
        return len(o.samples)
}

// ToXOR encodes all samples of the chunk, in their stored order, into a new
// XOR chunk. Only the timestamp and float value of each sample are appended.
func (o *OOOChunk) ToXOR() (*chunkenc.XORChunk, error) {
        xor := chunkenc.NewXORChunk()
        appender, err := xor.Appender()
        if err != nil {
                return nil, err
        }
        for idx := range o.samples {
                appender.Append(o.samples[idx].t, o.samples[idx].f)
        }
        return xor, nil
}

// ToXORBetweenTimestamps encodes the samples whose timestamp lies within
// [mint, maxt] into a new XOR chunk. Iteration stops at the first sample that
// is not before mint and lies past maxt.
func (o *OOOChunk) ToXORBetweenTimestamps(mint, maxt int64) (*chunkenc.XORChunk, error) {
        xor := chunkenc.NewXORChunk()
        appender, err := xor.Appender()
        if err != nil {
                return nil, err
        }
        for idx := range o.samples {
                cur := o.samples[idx]
                switch {
                case cur.t < mint:
                        // Before the window; keep scanning.
                case cur.t > maxt:
                        // Past the window; nothing more to encode.
                        return xor, nil
                default:
                        appender.Append(cur.t, cur.f)
                }
        }
        return xor, nil
}

// Compile-time check that OOORangeHead satisfies BlockReader.
var _ BlockReader = &OOORangeHead{}

// OOORangeHead allows querying Head out of order samples via BlockReader
// interface implementation.
type OOORangeHead struct {
        head *Head
        // mint and maxt are tracked because when a query is handled we only want
        // the timerange of the query and having preexisting pointers to the first
        // and last timestamp help with that.
        mint, maxt int64

        // isoState is the read registered with head.oooIso by NewOOORangeHead.
        // It is handed to the chunk reader returned by Chunks.
        isoState *oooIsolationState
}

// NewOOORangeHead returns an OOORangeHead over head restricted to [mint, maxt].
// It registers a read after minRef with the head's out-of-order isolation and
// keeps the resulting state on the returned value.
func NewOOORangeHead(head *Head, mint, maxt int64, minRef chunks.ChunkDiskMapperRef) *OOORangeHead {
        rh := &OOORangeHead{
                head: head,
                mint: mint,
                maxt: maxt,
        }
        rh.isoState = head.oooIso.TrackReadAfter(minRef)
        return rh
}

// Index returns an OOOHeadIndexReader over the head for [mint, maxt], using
// the isolation state's minRef as the last garbage collected m-map reference.
// The returned error is always nil.
func (oh *OOORangeHead) Index() (IndexReader, error) {
        return NewOOOHeadIndexReader(oh.head, oh.mint, oh.maxt, oh.isoState.minRef), nil
}

// Chunks returns an OOOHeadChunkReader over the head for [mint, maxt] that
// carries this range head's isolation state. The returned error is always nil.
func (oh *OOORangeHead) Chunks() (ChunkReader, error) {
        return NewOOOHeadChunkReader(oh.head, oh.mint, oh.maxt, oh.isoState), nil
}

// Tombstones returns a fresh, empty in-memory tombstones reader.
func (oh *OOORangeHead) Tombstones() (tombstones.Reader, error) {
        // As stated in the design doc https://docs.google.com/document/d/1Kppm7qL9C-BJB1j6yb6-9ObG3AbdZnFUBYPNNWwDBYM/edit?usp=sharing
        // Tombstones are not supported for out of order metrics.
        return tombstones.NewMemTombstones(), nil
}

// oooRangeHeadULID is the fixed ULID reported by Meta for every OOORangeHead.
var oooRangeHeadULID = ulid.MustParse("0000000000XXXX000RANGEHEAD")

// Meta returns a BlockMeta carrying the range head's mint/maxt, the fixed
// oooRangeHeadULID, and the series count of the underlying head.
func (oh *OOORangeHead) Meta() BlockMeta {
        return BlockMeta{
                MinTime: oh.mint,
                MaxTime: oh.maxt,
                ULID:    oooRangeHeadULID,
                Stats: BlockStats{
                        NumSeries: oh.head.NumSeries(),
                },
        }
}

// Size returns the size taken by the Head block.
func (oh *OOORangeHead) Size() int64 {
        return oh.head.Size()
}

// String returns a human-readable representation of the out of order range
// head. It's important to keep this function in order to avoid the struct dump
// when the head is stringified in errors or logs.
func (oh *OOORangeHead) String() string {
        return fmt.Sprintf("ooo range head (mint: %d, maxt: %d)", oh.MinTime(), oh.MaxTime())
}

// MinTime returns the lower bound of the time range this range head covers.
func (oh *OOORangeHead) MinTime() int64 {
        return oh.mint
}

// MaxTime returns the upper bound of the time range this range head covers.
func (oh *OOORangeHead) MaxTime() int64 {
        return oh.maxt
}

// Copyright 2022 The Prometheus Authors
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package tsdb

import (
        "context"
        "errors"
        "math"
        "slices"

        "github.com/oklog/ulid"

        "github.com/prometheus/prometheus/model/labels"
        "github.com/prometheus/prometheus/storage"
        "github.com/prometheus/prometheus/tsdb/chunkenc"
        "github.com/prometheus/prometheus/tsdb/chunks"
        "github.com/prometheus/prometheus/tsdb/index"
        "github.com/prometheus/prometheus/tsdb/tombstones"
)

// Compile-time check that OOOHeadIndexReader satisfies IndexReader.
var _ IndexReader = &OOOHeadIndexReader{}

// OOOHeadIndexReader implements IndexReader so ooo samples in the head can be
// accessed.
// It also has a reference to headIndexReader so we can leverage on its
// IndexReader implementation for all the methods that remain the same. We
// decided to do this to avoid code duplication.
// The only methods that change are the ones about getting Series and Postings.
type OOOHeadIndexReader struct {
        *headIndexReader            // A reference to the headIndexReader so we can reuse as many interface implementation as possible.
        // lastGarbageCollectedMmapRef is passed to series by Series: m-mapped
        // chunks at or before this ref are not considered. 0 disables the check.
        lastGarbageCollectedMmapRef chunks.ChunkDiskMapperRef
}

// Compile-time check that mergedOOOChunks satisfies chunkenc.Iterable.
var _ chunkenc.Iterable = &mergedOOOChunks{}

// mergedOOOChunks holds the list of iterables for overlapping chunks.
type mergedOOOChunks struct {
        chunkIterables []chunkenc.Iterable
}

// Iterator returns a single sample iterator chained over all held iterables.
// The given iterator is forwarded to storage.ChainSampleIteratorFromIterables.
func (o mergedOOOChunks) Iterator(iterator chunkenc.Iterator) chunkenc.Iterator {
        return storage.ChainSampleIteratorFromIterables(iterator, o.chunkIterables)
}

// NewOOOHeadIndexReader returns an OOOHeadIndexReader over head for the time
// range [mint, maxt]. lastGarbageCollectedMmapRef is stored on the reader and
// later used by Series to skip m-mapped chunks at or before that reference.
func NewOOOHeadIndexReader(head *Head, mint, maxt int64, lastGarbageCollectedMmapRef chunks.ChunkDiskMapperRef) *OOOHeadIndexReader {
        return &OOOHeadIndexReader{
                headIndexReader: &headIndexReader{
                        head: head,
                        mint: mint,
                        maxt: maxt,
                },
                lastGarbageCollectedMmapRef: lastGarbageCollectedMmapRef,
        }
}

// Series fills builder with the labels of the series identified by ref and,
// if chks is non-nil, chks with its out-of-order chunk metas. It delegates to
// series with the reader's lastGarbageCollectedMmapRef and a maxMmapRef of 0,
// so the in-memory ooo head chunk is eligible.
func (oh *OOOHeadIndexReader) Series(ref storage.SeriesRef, builder *labels.ScratchBuilder, chks *[]chunks.Meta) error {
        return oh.series(ref, builder, chks, oh.lastGarbageCollectedMmapRef, 0)
}

// series looks up the series identified by ref, assigns its labels to builder
// and, when chks is non-nil, replaces the contents of *chks with the metas of
// the out-of-order chunks overlapping [oh.mint, oh.maxt]. Overlapping chunks
// are collapsed into a single meta (see below). It returns
// storage.ErrNotFound if the series does not exist.
//
// lastGarbageCollectedMmapRef gives the last mmap chunk that may be being garbage collected and so
// any chunk at or before this ref will not be considered. 0 disables this check.
//
// maxMmapRef tells up to which m-map chunk we can consider. If it is non-0, then
// the oooHeadChunk will not be considered.
func (oh *OOOHeadIndexReader) series(ref storage.SeriesRef, builder *labels.ScratchBuilder, chks *[]chunks.Meta, lastGarbageCollectedMmapRef, maxMmapRef chunks.ChunkDiskMapperRef) error {
        s := oh.head.series.getByID(chunks.HeadSeriesRef(ref))

        if s == nil {
                oh.head.metrics.seriesNotFound.Inc()
                return storage.ErrNotFound
        }
        builder.Assign(s.lset)

        // The caller is only interested in the labels.
        if chks == nil {
                return nil
        }

        s.Lock()
        defer s.Unlock()
        // Reuse the caller's slice, dropping whatever it held before.
        *chks = (*chks)[:0]

        // The series has no out-of-order data at all.
        if s.ooo == nil {
                return nil
        }

        tmpChks := make([]chunks.Meta, 0, len(s.ooo.oooMmappedChunks))

        // We define these markers to track the last chunk reference while we
        // fill the chunk meta.
        // These markers are useful to give consistent responses to repeated queries
        // even if new chunks that might be overlapping or not are added afterwards.
        // Also, lastMinT and lastMaxT are initialized to the max int as a sentinel
        // value to know they are unset.
        var lastChunkRef chunks.ChunkRef
        lastMinT, lastMaxT := int64(math.MaxInt64), int64(math.MaxInt64)

        addChunk := func(minT, maxT int64, ref chunks.ChunkRef) {
                // The first time we get called is for the last included chunk.
                // Set the markers accordingly; later calls leave them untouched.
                if lastMinT == int64(math.MaxInt64) {
                        lastChunkRef = ref
                        lastMinT = minT
                        lastMaxT = maxT
                }

                tmpChks = append(tmpChks, chunks.Meta{
                        MinTime:        minT,
                        MaxTime:        maxT,
                        Ref:            ref,
                        OOOLastRef:     lastChunkRef,
                        OOOLastMinTime: lastMinT,
                        OOOLastMaxTime: lastMaxT,
                })
        }

        // Collect all chunks that overlap the query range, in order from most recent to oldest,
        // so we can set the correct markers.
        if s.ooo.oooHeadChunk != nil {
                c := s.ooo.oooHeadChunk
                // The in-memory head chunk is only eligible when no maxMmapRef limit is set.
                if c.OverlapsClosedInterval(oh.mint, oh.maxt) && maxMmapRef == 0 {
                        ref := chunks.ChunkRef(chunks.NewHeadChunkRef(s.ref, s.oooHeadChunkID(len(s.ooo.oooMmappedChunks))))
                        addChunk(c.minTime, c.maxTime, ref)
                }
        }
        for i := len(s.ooo.oooMmappedChunks) - 1; i >= 0; i-- {
                c := s.ooo.oooMmappedChunks[i]
                // Keep the chunk only if it overlaps the query range, is not past
                // maxMmapRef (when set), and is after lastGarbageCollectedMmapRef (when set).
                if c.OverlapsClosedInterval(oh.mint, oh.maxt) && (maxMmapRef == 0 || maxMmapRef.GreaterThanOrEqualTo(c.ref)) && (lastGarbageCollectedMmapRef == 0 || c.ref.GreaterThan(lastGarbageCollectedMmapRef)) {
                        ref := chunks.ChunkRef(chunks.NewHeadChunkRef(s.ref, s.oooHeadChunkID(i)))
                        addChunk(c.minTime, c.maxTime, ref)
                }
        }

        // There is nothing to do if we did not collect any chunk.
        if len(tmpChks) == 0 {
                return nil
        }

        // Next we want to sort all the collected chunks by min time so we can find
        // those that overlap.
        slices.SortFunc(tmpChks, lessByMinTimeAndMinRef)

        // Next we want to iterate the sorted collected chunks and, for each group
        // of overlapping chunks, only return the chunk Meta of the first chunk of
        // the group, with its MaxTime extended to cover the whole group.
        // Example chunks of a series: 5:(100, 200) 6:(500, 600) 7:(150, 250) 8:(550, 650)
        // In the example 5 overlaps with 7 and 6 overlaps with 8 so we only want to
        // return chunk Metas for chunk 5 and chunk 6.
        *chks = append(*chks, tmpChks[0])
        maxTime := tmpChks[0].MaxTime // Tracks the maxTime of the previous "to be merged chunk".
        for _, c := range tmpChks[1:] {
                switch {
                case c.MinTime > maxTime:
                        // No overlap with the current group: c starts a new group.
                        *chks = append(*chks, c)
                        maxTime = c.MaxTime
                case c.MaxTime > maxTime:
                        // c overlaps the current group and reaches further: extend the group.
                        maxTime = c.MaxTime
                        (*chks)[len(*chks)-1].MaxTime = c.MaxTime
                }
        }

        return nil
}

// LabelValues overrides the headIndexReader implementation because the
// up-front overlap check must be made against the head's out-of-order time
// range (MinOOOTime/MaxOOOTime). When the query interval does not overlap that
// range an empty, non-nil slice is returned.
func (oh *OOOHeadIndexReader) LabelValues(ctx context.Context, name string, matchers ...*labels.Matcher) ([]string, error) {
        outsideOOORange := oh.maxt < oh.head.MinOOOTime() || oh.mint > oh.head.MaxOOOTime()

        switch {
        case outsideOOORange:
                return []string{}, nil
        case len(matchers) == 0:
                return oh.head.postings.LabelValues(ctx, name), nil
        default:
                return labelValuesWithMatchers(ctx, oh, name, matchers...)
        }
}

// chunkMetaAndChunkDiskMapperRef bundles a chunk meta with a chunk disk mapper
// reference and a pair of original min/max timestamps.
// NOTE(review): the producers of this type are outside this section;
// presumably origMinT/origMaxT keep the chunk's own bounds when meta's times
// are adjusted — confirm against the caller.
type chunkMetaAndChunkDiskMapperRef struct {
        meta     chunks.Meta
        ref      chunks.ChunkDiskMapperRef
        origMinT int64
        origMaxT int64
}

// refLessByMinTimeAndMinRef is a three-way comparison ordering by
// meta.MinTime first and by meta.Ref second. It returns -1, 0 or 1.
func refLessByMinTimeAndMinRef(a, b chunkMetaAndChunkDiskMapperRef) int {
        if a.meta.MinTime != b.meta.MinTime {
                if a.meta.MinTime < b.meta.MinTime {
                        return -1
                }
                return 1
        }

        // Same MinTime: fall back to the chunk reference.
        if a.meta.Ref != b.meta.Ref {
                if a.meta.Ref < b.meta.Ref {
                        return -1
                }
                return 1
        }
        return 0
}

// lessByMinTimeAndMinRef is a three-way comparison of chunk metas ordering by
// MinTime first and by Ref second. It returns -1, 0 or 1.
func lessByMinTimeAndMinRef(a, b chunks.Meta) int {
        if a.MinTime != b.MinTime {
                if a.MinTime < b.MinTime {
                        return -1
                }
                return 1
        }

        // Same MinTime: fall back to the chunk reference.
        if a.Ref != b.Ref {
                if a.Ref < b.Ref {
                        return -1
                }
                return 1
        }
        return 0
}

// Postings returns the postings for the given label name and values, read
// from the head's postings: empty postings when no value is given, the single
// list for one value, and the merge of the per-value lists otherwise.
func (oh *OOOHeadIndexReader) Postings(ctx context.Context, name string, values ...string) (index.Postings, error) {
        if len(values) == 0 {
                return index.EmptyPostings(), nil
        }
        if len(values) == 1 {
                return oh.head.postings.Get(name, values[0]), nil // TODO(ganesh) Also call GetOOOPostings
        }

        // TODO(ganesh) We want to only return postings for out of order series.
        perValue := make([]index.Postings, len(values))
        for idx, value := range values {
                perValue[idx] = oh.head.postings.Get(name, value) // TODO(ganesh) Also call GetOOOPostings
        }
        return index.Merge(ctx, perValue...), nil
}

// OOOHeadChunkReader reads out-of-order chunks of the head for the time range
// [mint, maxt].
type OOOHeadChunkReader struct {
        head       *Head
        mint, maxt int64
        // isoState, when non-nil, is closed by Close.
        isoState   *oooIsolationState
}

// NewOOOHeadChunkReader returns an OOOHeadChunkReader over head for
// [mint, maxt]. isoState may be nil, in which case Close has nothing to release.
func NewOOOHeadChunkReader(head *Head, mint, maxt int64, isoState *oooIsolationState) *OOOHeadChunkReader {
        return &OOOHeadChunkReader{
                head:     head,
                mint:     mint,
                maxt:     maxt,
                isoState: isoState,
        }
}

// ChunkOrIterable resolves meta to the merged out-of-order chunks of its
// series. It never returns a chunkenc.Chunk; on success the result is returned
// as the Iterable. storage.ErrNotFound is returned when the series no longer
// exists, when it has no out-of-order data, or when nothing in the reader's
// time range matched the requested chunk.
func (cr OOOHeadChunkReader) ChunkOrIterable(meta chunks.Meta) (chunkenc.Chunk, chunkenc.Iterable, error) {
        seriesID, _ := chunks.HeadChunkRef(meta.Ref).Unpack()

        ms := cr.head.series.getByID(seriesID)
        if ms == nil {
                // The series has been garbage collected.
                return nil, nil, storage.ErrNotFound
        }

        // The series lock is only held while reading ooo state and merging.
        ms.Lock()
        if ms.ooo == nil {
                // There is no OOO data for this series.
                ms.Unlock()
                return nil, nil, storage.ErrNotFound
        }
        merged, err := ms.oooMergedChunks(meta, cr.head.chunkDiskMapper, cr.mint, cr.maxt)
        ms.Unlock()

        switch {
        case err != nil:
                return nil, nil, err
        case len(merged.chunkIterables) == 0:
                // The query range did not overlap with the requested chunk.
                return nil, nil, storage.ErrNotFound
        }

        return nil, merged, nil
}

// Close closes the reader's isolation state, if it has one. It always
// returns nil.
func (cr OOOHeadChunkReader) Close() error {
        if cr.isoState == nil {
                return nil
        }
        cr.isoState.Close()
        return nil
}

// OOOCompactionHead is the view of the head's out-of-order data built by
// NewOOOCompactionHead for compaction.
type OOOCompactionHead struct {
        oooIR       *OOOHeadIndexReader       // Index reader over the head used to resolve series.
        lastMmapRef chunks.ChunkDiskMapperRef // Highest m-map chunk ref seen across all ooo series.
        lastWBLFile int                       // Value returned by head.wbl.NextSegmentSync, if the head has a WBL.
        postings    []storage.SeriesRef       // Refs of the series having at least one m-mapped ooo chunk.
        chunkRange  int64                     // head.chunkRange at construction time.
        mint, maxt  int64 // Among all the compactable chunks.
}

// NewOOOCompactionHead does the following:
// 1. M-maps all the in-memory ooo chunks.
// 2. Compute the expected block ranges while iterating through all ooo series and store it.
// 3. Store the list of postings having ooo series.
// 4. Cuts a new WBL file for the OOO WBL.
// All the above together have a bit of CPU and memory overhead, and can have a bit of impact
// on the sample append latency. So call NewOOOCompactionHead only right before compaction.
//
// An error is returned if cutting the WBL segment or fetching the postings fails.
func NewOOOCompactionHead(ctx context.Context, head *Head) (*OOOCompactionHead, error) {
        // mint/maxt start at the extremes so the first chunk seen sets both.
        ch := &OOOCompactionHead{
                chunkRange: head.chunkRange.Load(),
                mint:       math.MaxInt64,
                maxt:       math.MinInt64,
        }

        if head.wbl != nil {
                lastWBLFile, err := head.wbl.NextSegmentSync()
                if err != nil {
                        return nil, err
                }
                ch.lastWBLFile = lastWBLFile
        }

        // The index reader spans all of time and has the garbage-collection check disabled (0).
        ch.oooIR = NewOOOHeadIndexReader(head, math.MinInt64, math.MaxInt64, 0)
        n, v := index.AllPostingsKey()

        // TODO: verify this gets only ooo samples.
        p, err := ch.oooIR.Postings(ctx, n, v)
        if err != nil {
                return nil, err
        }
        p = ch.oooIR.SortedPostings(p)

        // lastSeq/lastOff hold the unpacked form of ch.lastMmapRef, i.e. of the
        // highest m-map reference seen so far.
        var lastSeq, lastOff int
        for p.Next() {
                seriesRef := p.At()
                ms := head.series.getByID(chunks.HeadSeriesRef(seriesRef))
                if ms == nil {
                        continue
                }

                // M-map the in-memory chunk and keep track of the last one.
                // Also build the block ranges -> series map.
                // TODO: consider having a lock specifically for ooo data.
                ms.Lock()

                // Series without out-of-order data are skipped.
                if ms.ooo == nil {
                        ms.Unlock()
                        continue
                }

                mmapRef := ms.mmapCurrentOOOHeadChunk(head.chunkDiskMapper)
                if mmapRef == 0 && len(ms.ooo.oooMmappedChunks) > 0 {
                        // Nothing was m-mapped. So take the mmapRef from the existing slice if it exists.
                        mmapRef = ms.ooo.oooMmappedChunks[len(ms.ooo.oooMmappedChunks)-1].ref
                }
                seq, off := mmapRef.Unpack()
                // Keep the greatest reference, ordered by sequence and then offset.
                if seq > lastSeq || (seq == lastSeq && off > lastOff) {
                        ch.lastMmapRef, lastSeq, lastOff = mmapRef, seq, off
                }
                if len(ms.ooo.oooMmappedChunks) > 0 {
                        ch.postings = append(ch.postings, seriesRef)
                        // Widen the overall [mint, maxt] to cover every m-mapped chunk.
                        for _, c := range ms.ooo.oooMmappedChunks {
                                if c.minTime < ch.mint {
                                        ch.mint = c.minTime
                                }
                                if c.maxTime > ch.maxt {
                                        ch.maxt = c.maxTime
                                }
                        }
                }
                ms.Unlock()
        }

        return ch, nil
}

// Index returns an OOOCompactionHeadIndexReader backed by this compaction
// head. The returned error is always nil.
func (ch *OOOCompactionHead) Index() (IndexReader, error) {
        return NewOOOCompactionHeadIndexReader(ch), nil
}

// Chunks returns an OOOHeadChunkReader over the same head and time range as
// the embedded index reader, without any isolation state. The returned error
// is always nil.
func (ch *OOOCompactionHead) Chunks() (ChunkReader, error) {
        return NewOOOHeadChunkReader(ch.oooIR.head, ch.oooIR.mint, ch.oooIR.maxt, nil), nil
}

// Tombstones returns a fresh, empty in-memory tombstones reader.
func (ch *OOOCompactionHead) Tombstones() (tombstones.Reader, error) {
        return tombstones.NewMemTombstones(), nil
}

// oooCompactionHeadULID is the fixed ULID reported by Meta for every
// OOOCompactionHead.
var oooCompactionHeadULID = ulid.MustParse("0000000000XX000COMPACTHEAD")

// Meta returns a BlockMeta carrying the compaction head's mint/maxt, the fixed
// oooCompactionHeadULID, and the number of collected postings as series count.
func (ch *OOOCompactionHead) Meta() BlockMeta {
        return BlockMeta{
                MinTime: ch.mint,
                MaxTime: ch.maxt,
                ULID:    oooCompactionHeadULID,
                Stats: BlockStats{
                        NumSeries: uint64(len(ch.postings)),
                },
        }
}

// CloneForTimeRange returns a copy of the OOOCompactionHead whose IndexReader
// and ChunkReader only look at the m-map chunks within [mint, maxt], while
// still not looking beyond ch.lastMmapRef. The clone keeps the original's own
// mint/maxt, postings and chunk range; lastWBLFile is not carried over.
// Only the methods of the BlockReader interface are valid on the clone.
func (ch *OOOCompactionHead) CloneForTimeRange(mint, maxt int64) *OOOCompactionHead {
        clone := &OOOCompactionHead{
                lastMmapRef: ch.lastMmapRef,
                postings:    ch.postings,
                chunkRange:  ch.chunkRange,
                mint:        ch.mint,
                maxt:        ch.maxt,
        }
        // Only the index reader is narrowed to the requested time range.
        clone.oooIR = NewOOOHeadIndexReader(ch.oooIR.head, mint, maxt, 0)
        return clone
}

// Plain accessors. Size always reports 0; MinTime/MaxTime report the bounds
// among all compactable chunks; ChunkRange, LastMmapRef and LastWBLFile return
// the values captured when the compaction head was built.
func (ch *OOOCompactionHead) Size() int64                            { return 0 }
func (ch *OOOCompactionHead) MinTime() int64                         { return ch.mint }
func (ch *OOOCompactionHead) MaxTime() int64                         { return ch.maxt }
func (ch *OOOCompactionHead) ChunkRange() int64                      { return ch.chunkRange }
func (ch *OOOCompactionHead) LastMmapRef() chunks.ChunkDiskMapperRef { return ch.lastMmapRef }
func (ch *OOOCompactionHead) LastWBLFile() int                       { return ch.lastWBLFile }

// OOOCompactionHeadIndexReader is the IndexReader handed out by
// OOOCompactionHead.Index. It answers from the compaction head's stored
// postings and its embedded OOOHeadIndexReader.
type OOOCompactionHeadIndexReader struct {
        ch *OOOCompactionHead
}

// NewOOOCompactionHeadIndexReader returns an IndexReader backed by ch.
func NewOOOCompactionHeadIndexReader(ch *OOOCompactionHead) IndexReader {
        return &OOOCompactionHeadIndexReader{ch: ch}
}

// Symbols delegates to the compaction head's OOOHeadIndexReader.
func (ir *OOOCompactionHeadIndexReader) Symbols() index.StringIter {
        return ir.ch.oooIR.Symbols()
}

// Postings returns the compaction head's stored postings as list postings.
// Only the "all postings" key is supported; any other name/values combination
// yields an error.
func (ir *OOOCompactionHeadIndexReader) Postings(_ context.Context, name string, values ...string) (index.Postings, error) {
        allName, allValue := index.AllPostingsKey()

        isAllPostings := name == allName && len(values) == 1 && values[0] == allValue
        if !isAllPostings {
                return nil, errors.New("only AllPostingsKey is supported")
        }
        return index.NewListPostings(ir.ch.postings), nil
}

// PostingsForLabelMatching is not supported; it returns postings that carry a
// "not supported" error.
func (ir *OOOCompactionHeadIndexReader) PostingsForLabelMatching(context.Context, string, func(string) bool) index.Postings {
        return index.ErrPostings(errors.New("not supported"))
}

// SortedPostings returns p unchanged.
func (ir *OOOCompactionHeadIndexReader) SortedPostings(p index.Postings) index.Postings {
        // This will already be sorted from the Postings() call above.
        return p
}

// ShardedPostings delegates to the compaction head's OOOHeadIndexReader.
func (ir *OOOCompactionHeadIndexReader) ShardedPostings(p index.Postings, shardIndex, shardCount uint64) index.Postings {
        return ir.ch.oooIR.ShardedPostings(p, shardIndex, shardCount)
}

// Series delegates to the OOOHeadIndexReader's series with the garbage
// collection check disabled (0) and the compaction head's lastMmapRef as
// maxMmapRef.
func (ir *OOOCompactionHeadIndexReader) Series(ref storage.SeriesRef, builder *labels.ScratchBuilder, chks *[]chunks.Meta) error {
        return ir.ch.oooIR.series(ref, builder, chks, 0, ir.ch.lastMmapRef)
}

// SortedLabelValues is not implemented and always returns an error.
func (ir *OOOCompactionHeadIndexReader) SortedLabelValues(_ context.Context, name string, matchers ...*labels.Matcher) ([]string, error) {
        return nil, errors.New("not implemented")
}

// LabelValues is not implemented and always returns an error.
func (ir *OOOCompactionHeadIndexReader) LabelValues(_ context.Context, name string, matchers ...*labels.Matcher) ([]string, error) {
        return nil, errors.New("not implemented")
}

// PostingsForMatchers is not implemented and always returns an error.
func (ir *OOOCompactionHeadIndexReader) PostingsForMatchers(_ context.Context, concurrent bool, ms ...*labels.Matcher) (index.Postings, error) {
        return nil, errors.New("not implemented")
}

// LabelNames is not implemented and always returns an error.
func (ir *OOOCompactionHeadIndexReader) LabelNames(context.Context, ...*labels.Matcher) ([]string, error) {
        return nil, errors.New("not implemented")
}

// LabelValueFor is not implemented and always returns an error.
func (ir *OOOCompactionHeadIndexReader) LabelValueFor(context.Context, storage.SeriesRef, string) (string, error) {
        return "", errors.New("not implemented")
}

// LabelNamesFor is not implemented and always returns an error.
func (ir *OOOCompactionHeadIndexReader) LabelNamesFor(ctx context.Context, postings index.Postings) ([]string, error) {
        return nil, errors.New("not implemented")
}

// Close closes the compaction head's OOOHeadIndexReader.
func (ir *OOOCompactionHeadIndexReader) Close() error {
        return ir.ch.oooIR.Close()
}

// Copyright 2023 The Prometheus Authors
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package tsdb

import (
        "container/list"
        "sync"

        "github.com/prometheus/prometheus/tsdb/chunks"
)

// oooIsolation keeps track of the currently open out-of-order reads.
type oooIsolation struct {
        // mtx guards openReads.
        mtx       sync.RWMutex
        // openReads holds one *oooIsolationState per tracked read.
        openReads *list.List
}

// oooIsolationState represents a single tracked read.
type oooIsolationState struct {
        i *oooIsolation // The tracker this read is registered with.
        e *list.Element // This read's element in i.openReads; removed by Close.

        // minRef is the reference after which this read uses chunks.
        minRef chunks.ChunkDiskMapperRef
}

// newOOOIsolation returns an oooIsolation with an empty list of open reads.
func newOOOIsolation() *oooIsolation {
        iso := new(oooIsolation)
        iso.openReads = list.New()
        return iso
}

// HasOpenReadsAtOrBefore reports whether any tracked read uses chunks with a
// reference at or before ref, i.e. whether ref is greater than the minRef of
// at least one open read.
func (i *oooIsolation) HasOpenReadsAtOrBefore(ref chunks.ChunkDiskMapperRef) bool {
        i.mtx.RLock()
        defer i.mtx.RUnlock()

        for elem := i.openReads.Front(); elem != nil; elem = elem.Next() {
                if state := elem.Value.(*oooIsolationState); ref.GreaterThan(state.minRef) {
                        return true
                }
        }
        return false
}

// TrackReadAfter registers a read that uses chunks with a reference after
// minRef and returns the state representing it.
//
// The caller must make sure the returned oooIsolationState is eventually
// closed once the read is complete.
func (i *oooIsolation) TrackReadAfter(minRef chunks.ChunkDiskMapperRef) *oooIsolationState {
        state := new(oooIsolationState)
        state.i = i
        state.minRef = minRef

        i.mtx.Lock()
        defer i.mtx.Unlock()

        // Remember the list element so Close can remove it again.
        state.e = i.openReads.PushBack(state)
        return state
}

// Close removes this read from its tracker's list of open reads.
func (s oooIsolationState) Close() {
        tracker := s.i

        tracker.mtx.Lock()
        defer tracker.mtx.Unlock()

        tracker.openReads.Remove(s.e)
}

// Copyright 2017 The Prometheus Authors
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package tsdb

import (
        "context"
        "errors"
        "fmt"
        "math"
        "slices"

        "github.com/oklog/ulid"

        "github.com/prometheus/prometheus/model/histogram"
        "github.com/prometheus/prometheus/model/labels"
        "github.com/prometheus/prometheus/storage"
        "github.com/prometheus/prometheus/tsdb/chunkenc"
        "github.com/prometheus/prometheus/tsdb/chunks"
        tsdb_errors "github.com/prometheus/prometheus/tsdb/errors"
        "github.com/prometheus/prometheus/tsdb/index"
        "github.com/prometheus/prometheus/tsdb/tombstones"
        "github.com/prometheus/prometheus/util/annotations"
)

// checkContextEveryNIterations is used in some tight loops to check if the context is done.
// Such loops test ctx.Err() only once per this many iterations instead of on every pass.
const checkContextEveryNIterations = 100

// blockBaseQuerier bundles the index, chunk and tombstone readers of a single
// block together with a default time range. It is embedded by blockQuerier and
// blockChunkQuerier.
type blockBaseQuerier struct {
        blockID    ulid.ULID
        index      IndexReader
        chunks     ChunkReader
        tombstones tombstones.Reader

        // closed is set by Close; a second Close call returns an error.
        closed bool

        // mint and maxt are the time range used by Select when no hints are passed.
        mint, maxt int64
}

// newBlockBaseQuerier opens the index, chunk and tombstone readers of b and
// wraps them in a blockBaseQuerier for the range [mint, maxt].
//
// If opening a later reader fails, the readers that were already opened are
// closed before the wrapped error is returned. A nil tombstone reader is
// replaced by an empty in-memory one.
func newBlockBaseQuerier(b BlockReader, mint, maxt int64) (*blockBaseQuerier, error) {
        ir, err := b.Index()
        if err != nil {
                return nil, fmt.Errorf("open index reader: %w", err)
        }

        cr, err := b.Chunks()
        if err != nil {
                ir.Close()
                return nil, fmt.Errorf("open chunk reader: %w", err)
        }

        tr, err := b.Tombstones()
        switch {
        case err != nil:
                ir.Close()
                cr.Close()
                return nil, fmt.Errorf("open tombstone reader: %w", err)
        case tr == nil:
                tr = tombstones.NewMemTombstones()
        }

        base := &blockBaseQuerier{
                blockID:    b.Meta().ULID,
                index:      ir,
                chunks:     cr,
                tombstones: tr,
                mint:       mint,
                maxt:       maxt,
        }
        return base, nil
}

// LabelValues returns the sorted values of label name as reported by the
// index reader for the given matchers. No annotations are ever produced.
func (q *blockBaseQuerier) LabelValues(ctx context.Context, name string, matchers ...*labels.Matcher) ([]string, annotations.Annotations, error) {
        values, err := q.index.SortedLabelValues(ctx, name, matchers...)

        var noAnnotations annotations.Annotations
        return values, noAnnotations, err
}

// LabelNames returns the label names reported by the index reader for the
// given matchers. No annotations are ever produced.
func (q *blockBaseQuerier) LabelNames(ctx context.Context, matchers ...*labels.Matcher) ([]string, annotations.Annotations, error) {
        names, err := q.index.LabelNames(ctx, matchers...)

        var noAnnotations annotations.Annotations
        return names, noAnnotations, err
}

// Close closes the index, chunk and tombstone readers and returns their
// combined error. All three readers are closed even if one of them fails.
// Calling Close a second time returns an error without touching the readers.
func (q *blockBaseQuerier) Close() error {
        if q.closed {
                return errors.New("block querier already closed")
        }

        indexErr := q.index.Close()
        chunksErr := q.chunks.Close()
        tombstonesErr := q.tombstones.Close()
        q.closed = true

        combined := tsdb_errors.NewMulti(indexErr, chunksErr, tombstonesErr)
        return combined.Err()
}

// blockQuerier is the querier returned by NewBlockQuerier. It adds a
// sample-level Select on top of the embedded blockBaseQuerier.
type blockQuerier struct {
        *blockBaseQuerier
}

// NewBlockQuerier returns a querier against the block reader and requested min and max time range.
// Any error from opening the block's readers is returned unchanged.
func NewBlockQuerier(b BlockReader, mint, maxt int64) (storage.Querier, error) {
        base, err := newBlockBaseQuerier(b, mint, maxt)
        if err != nil {
                return nil, err
        }

        querier := &blockQuerier{blockBaseQuerier: base}
        return querier, nil
}

// Select returns the series of the block that match ms.
//
// Postings are resolved first, then optionally sharded (when hints carry a
// positive ShardCount) and sorted (when sortSeries is set). Without hints the
// querier's own time range is used and trimming stays enabled. With hints,
// their Start, End and DisableTrimming take over, and a hints.Func of "series"
// swaps in a no-op chunk reader.
func (q *blockQuerier) Select(ctx context.Context, sortSeries bool, hints *storage.SelectHints, ms ...*labels.Matcher) storage.SeriesSet {
        postings, err := PostingsForMatchers(ctx, q.index, ms...)
        if err != nil {
                return storage.ErrSeriesSet(err)
        }
        if hints != nil && hints.ShardCount > 0 {
                postings = q.index.ShardedPostings(postings, hints.ShardIndex, hints.ShardCount)
        }
        if sortSeries {
                postings = q.index.SortedPostings(postings)
        }

        if hints == nil {
                return newBlockSeriesSet(q.index, q.chunks, q.tombstones, postings, q.mint, q.maxt, false)
        }

        if hints.Func == "series" {
                // When you're only looking up metadata (for example series API), you don't need to load any chunks.
                return newBlockSeriesSet(q.index, newNopChunkReader(), q.tombstones, postings, hints.Start, hints.End, hints.DisableTrimming)
        }
        return newBlockSeriesSet(q.index, q.chunks, q.tombstones, postings, hints.Start, hints.End, hints.DisableTrimming)
}

// blockChunkQuerier provides chunk querying access to a single block database.
// It is the querier returned by NewBlockChunkQuerier and adds a chunk-level
// Select on top of the embedded blockBaseQuerier.
type blockChunkQuerier struct {
        *blockBaseQuerier
}

// NewBlockChunkQuerier returns a chunk querier against the block reader and requested min and max time range.
// Any error from opening the block's readers is returned unchanged.
func NewBlockChunkQuerier(b BlockReader, mint, maxt int64) (storage.ChunkQuerier, error) {
        base, err := newBlockBaseQuerier(b, mint, maxt)
        if err != nil {
                return nil, err
        }

        querier := &blockChunkQuerier{blockBaseQuerier: base}
        return querier, nil
}

// Select returns the chunk series of the block that match ms.
//
// Postings are resolved first, then optionally sharded (when hints carry a
// positive ShardCount) and sorted (when sortSeries is set). The time range and
// trimming behaviour come from hints when present, otherwise from the querier
// itself with trimming enabled.
func (q *blockChunkQuerier) Select(ctx context.Context, sortSeries bool, hints *storage.SelectHints, ms ...*labels.Matcher) storage.ChunkSeriesSet {
        postings, err := PostingsForMatchers(ctx, q.index, ms...)
        if err != nil {
                return storage.ErrChunkSeriesSet(err)
        }
        if hints != nil && hints.ShardCount > 0 {
                postings = q.index.ShardedPostings(postings, hints.ShardIndex, hints.ShardCount)
        }
        if sortSeries {
                postings = q.index.SortedPostings(postings)
        }

        from, through, noTrim := q.mint, q.maxt, false
        if hints != nil {
                from, through, noTrim = hints.Start, hints.End, hints.DisableTrimming
        }
        return NewBlockChunkSeriesSet(q.blockID, q.index, q.chunks, q.tombstones, postings, from, through, noTrim)
}

// PostingsForMatchers assembles a single postings iterator against the index reader
// based on the given matchers. The resulting postings are not ordered by series.
//
// Matchers are split into "intersecting" ones, whose postings are intersected,
// and "subtracting" ones, whose postings are removed from that intersection.
// If there are only subtracting matchers, all postings are used as the base.
//
// Note that ms is stably reordered in place so that intersecting matchers come
// first; callers that care about the order of their slice must pass a copy.
//
// The context is checked once per matcher; its error is returned if it is done.
func PostingsForMatchers(ctx context.Context, ix IndexReader, ms ...*labels.Matcher) (index.Postings, error) {
        var its, notIts []index.Postings
        // See which label must be non-empty.
        // Optimization for case like {l=~".", l!="1"}.
        labelMustBeSet := make(map[string]bool, len(ms))
        for _, m := range ms {
                if !m.Matches("") {
                        labelMustBeSet[m.Name] = true
                }
        }
        isSubtractingMatcher := func(m *labels.Matcher) bool {
                if !labelMustBeSet[m.Name] {
                        return true
                }
                return (m.Type == labels.MatchNotEqual || m.Type == labels.MatchNotRegexp) && m.Matches("")
        }
        hasSubtractingMatchers, hasIntersectingMatchers := false, false
        for _, m := range ms {
                if isSubtractingMatcher(m) {
                        hasSubtractingMatchers = true
                } else {
                        hasIntersectingMatchers = true
                }
        }

        if hasSubtractingMatchers && !hasIntersectingMatchers {
                // If there's nothing to subtract from, add in everything and remove the notIts later.
                // We prefer to get AllPostings so that the base of subtraction (i.e. allPostings)
                // doesn't include series that may be added to the index reader during this function call.
                k, v := index.AllPostingsKey()
                allPostings, err := ix.Postings(ctx, k, v)
                if err != nil {
                        return nil, err
                }
                its = append(its, allPostings)
        }

        // Sort matchers to have the intersecting matchers first.
        // This way the base for subtraction is smaller and
        // there is no chance that the set we subtract from
        // contains postings of series that didn't exist when
        // we constructed the set we subtract by.
        //
        // The comparison must be a consistent ordering: two matchers of the same
        // kind compare equal (0) so that the stable sort keeps their relative order.
        slices.SortStableFunc(ms, func(i, j *labels.Matcher) int {
                iSubtracts, jSubtracts := isSubtractingMatcher(i), isSubtractingMatcher(j)
                switch {
                case !iSubtracts && jSubtracts:
                        return -1
                case iSubtracts && !jSubtracts:
                        return +1
                default:
                        return 0
                }
        })

        for _, m := range ms {
                if err := ctx.Err(); err != nil {
                        return nil, err
                }
                switch {
                case m.Name == "" && m.Value == "": // Special-case for AllPostings, used in tests at least.
                        k, v := index.AllPostingsKey()
                        allPostings, err := ix.Postings(ctx, k, v)
                        if err != nil {
                                return nil, err
                        }
                        its = append(its, allPostings)
                case labelMustBeSet[m.Name]:
                        // If this matcher must be non-empty, we can be smarter.
                        matchesEmpty := m.Matches("")
                        isNot := m.Type == labels.MatchNotEqual || m.Type == labels.MatchNotRegexp
                        switch {
                        case isNot && matchesEmpty: // l!="foo"
                                // If the label can't be empty and is a Not and the inner matcher
                                // doesn't match empty, then subtract it out at the end.
                                inverse, err := m.Inverse()
                                if err != nil {
                                        return nil, err
                                }

                                it, err := postingsForMatcher(ctx, ix, inverse)
                                if err != nil {
                                        return nil, err
                                }
                                notIts = append(notIts, it)
                        case isNot && !matchesEmpty: // l!=""
                                // If the label can't be empty and is a Not, but the inner matcher can
                                // be empty we need to use inversePostingsForMatcher.
                                inverse, err := m.Inverse()
                                if err != nil {
                                        return nil, err
                                }

                                it, err := inversePostingsForMatcher(ctx, ix, inverse)
                                if err != nil {
                                        return nil, err
                                }
                                // An empty intersecting set makes the whole result empty.
                                if index.IsEmptyPostingsType(it) {
                                        return index.EmptyPostings(), nil
                                }
                                its = append(its, it)
                        default: // l="a"
                                // Non-Not matcher, use normal postingsForMatcher.
                                it, err := postingsForMatcher(ctx, ix, m)
                                if err != nil {
                                        return nil, err
                                }
                                // An empty intersecting set makes the whole result empty.
                                if index.IsEmptyPostingsType(it) {
                                        return index.EmptyPostings(), nil
                                }
                                its = append(its, it)
                        }
                default: // l=""
                        // If the matchers for a labelname selects an empty value, it selects all
                        // the series which don't have the label name set too. See:
                        // https://github.com/prometheus/prometheus/issues/3575 and
                        // https://github.com/prometheus/prometheus/pull/3578#issuecomment-351653555
                        it, err := inversePostingsForMatcher(ctx, ix, m)
                        if err != nil {
                                return nil, err
                        }
                        notIts = append(notIts, it)
                }
        }

        it := index.Intersect(its...)

        for _, n := range notIts {
                it = index.Without(it, n)
        }

        return it, nil
}

// postingsForMatcher returns the postings of series whose label m.Name has a
// value accepted by m. This method will not return postings for missing labels.
func postingsForMatcher(ctx context.Context, ix IndexReader, m *labels.Matcher) (index.Postings, error) {
        switch m.Type {
        case labels.MatchEqual:
                // Fast-path for equal matching.
                return ix.Postings(ctx, m.Name, m.Value)
        case labels.MatchRegexp:
                // Fast-path for set matching.
                if set := m.SetMatches(); len(set) > 0 {
                        return ix.Postings(ctx, m.Name, set...)
                }
        }

        // General case: let the index evaluate the matcher against each value.
        matching := ix.PostingsForLabelMatching(ctx, m.Name, m.Matches)
        return matching, matching.Err()
}

// inversePostingsForMatcher returns the postings for the series with the label name set but not matching the matcher.
//
// The list of label values obtained from the index is filtered in place, so
// no second slice is allocated for the values that survive.
func inversePostingsForMatcher(ctx context.Context, ix IndexReader, m *labels.Matcher) (index.Postings, error) {
        switch m.Type {
        case labels.MatchNotRegexp:
                // Inverse of a MatchNotRegexp is MatchRegexp (double negation).
                // Fast-path for set matching.
                if set := m.SetMatches(); len(set) > 0 {
                        return ix.Postings(ctx, m.Name, set...)
                }
        case labels.MatchNotEqual:
                // Inverse of a MatchNotEqual is MatchEqual (double negation).
                return ix.Postings(ctx, m.Name, m.Value)
        }

        vals, err := ix.LabelValues(ctx, m.Name)
        if err != nil {
                return nil, err
        }

        // If the match before inversion was !="" or !~"", we just want all the values.
        if m.Value == "" && (m.Type == labels.MatchRegexp || m.Type == labels.MatchEqual) {
                return ix.Postings(ctx, m.Name, vals...)
        }

        // Keep only the values the matcher rejects; writes always trail reads.
        rejected := vals[:0]
        for idx, val := range vals {
                if (idx+1)%checkContextEveryNIterations == 0 && ctx.Err() != nil {
                        return nil, ctx.Err()
                }
                if !m.Matches(val) {
                        rejected = append(rejected, val)
                }
        }

        return ix.Postings(ctx, m.Name, rejected...)
}

// labelValuesWithMatchers returns the values of label name that occur on at
// least one series selected by matchers.
//
// Values are first narrowed using the matchers on name itself. If no matcher
// targets another label, that narrowed list is the result. Otherwise each
// remaining value's postings are intersected with the postings of all matchers.
// A nil slice is returned when no value survives the first narrowing step.
func labelValuesWithMatchers(ctx context.Context, r IndexReader, name string, matchers ...*labels.Matcher) ([]string, error) {
        candidates, err := r.LabelValues(ctx, name)
        if err != nil {
                return nil, fmt.Errorf("fetching values of label %s: %w", name, err)
        }

        // If we have a matcher for the label name, we can filter out values that don't match
        // before we fetch postings. This is especially useful for labels with many values.
        // e.g. __name__ with a selector like {__name__="xyz"}
        otherLabelsMatched := false
        for _, m := range matchers {
                if m.Name != name {
                        otherLabelsMatched = true
                        continue
                }

                // Filter in place to avoid allocations; this is safe because the
                // read position is always at or ahead of the write position.
                kept := candidates[:0]
                for idx, v := range candidates {
                        if (idx+1)%checkContextEveryNIterations == 0 && ctx.Err() != nil {
                                return nil, ctx.Err()
                        }
                        if m.Matches(v) {
                                kept = append(kept, v)
                        }
                }
                candidates = kept
        }

        switch {
        case len(candidates) == 0:
                return nil, nil
        case !otherLabelsMatched:
                // If we don't have any matchers for other labels, then we're done.
                return candidates, nil
        }

        selected, err := PostingsForMatchers(ctx, r, matchers...)
        if err != nil {
                return nil, fmt.Errorf("fetching postings for matchers: %w", err)
        }

        perValue := make([]index.Postings, 0, len(candidates))
        for _, value := range candidates {
                vp, err := r.Postings(ctx, name, value)
                if err != nil {
                        return nil, fmt.Errorf("fetching postings for %s=%q: %w", name, value, err)
                }
                perValue = append(perValue, vp)
        }

        hits, err := index.FindIntersectingPostings(selected, perValue)
        if err != nil {
                return nil, fmt.Errorf("intersecting postings: %w", err)
        }

        // hits holds positions into candidates (and perValue).
        result := make([]string, len(hits))
        for i, pos := range hits {
                result[i] = candidates[pos]
        }
        return result, nil
}

// labelNamesWithMatchers resolves the postings selected by matchers and asks
// the index reader for the label names used by those postings.
func labelNamesWithMatchers(ctx context.Context, r IndexReader, matchers ...*labels.Matcher) ([]string, error) {
        selected, err := PostingsForMatchers(ctx, r, matchers...)
        if err != nil {
                return nil, err
        }

        names, err := r.LabelNamesFor(ctx, selected)
        return names, err
}

// seriesData, used inside other iterators, are updated when we move from one series to another.
type seriesData struct {
        // chks are the chunk metas of the current series.
        chks      []chunks.Meta
        // intervals are the deletion intervals to apply when reading chks.
        intervals tombstones.Intervals
        // labels is the label set of the current series, returned by Labels.
        labels    labels.Labels
}

// Labels implements part of storage.Series and storage.ChunkSeries.
// It returns the label set stored for the current series.
func (s *seriesData) Labels() labels.Labels {
        return s.labels
}

// blockBaseSeriesSet allows to iterate over all series in the single block.
// Iterated series are trimmed with given min and max time as well as tombstones.
// See newBlockSeriesSet and NewBlockChunkSeriesSet to use it for either sample or chunk iterating.
type blockBaseSeriesSet struct {
        blockID         ulid.ULID
        // p yields the series references to visit.
        p               index.Postings
        index           IndexReader
        chunks          ChunkReader
        tombstones      tombstones.Reader
        // mint and maxt are the requested (closed) time range.
        mint, maxt      int64
        // disableTrimming, when set, stops Next from adding deletion intervals
        // for samples outside [mint, maxt].
        disableTrimming bool

        // curr is the series Next last advanced to.
        curr seriesData

        // bufChks and builder are scratch space handed to index.Series on each step.
        bufChks []chunks.Meta
        builder labels.ScratchBuilder
        // err is the first error hit by Next; Err reports it before p.Err().
        err     error
}

// Next advances to the next series that has at least one chunk overlapping
// [mint, maxt] which is not entirely covered by tombstones, and stores its
// labels, chunk metas and deletion intervals in b.curr.
//
// Series whose postings are stale (storage.ErrNotFound), that have no chunks,
// or whose chunks are all filtered out are skipped. It returns false when the
// postings are exhausted or an error occurred; the error is available via Err.
func (b *blockBaseSeriesSet) Next() bool {
        for b.p.Next() {
                ref := b.p.At()

                err := b.index.Series(ref, &b.builder, &b.bufChks)
                switch {
                case err == nil:
                case errors.Is(err, storage.ErrNotFound):
                        // Postings may be stale. Skip if no underlying series exists.
                        continue
                default:
                        b.err = fmt.Errorf("get series %d: %w", ref, err)
                        return false
                }

                if len(b.bufChks) == 0 {
                        continue
                }

                intervals, err := b.tombstones.Get(ref)
                if err != nil {
                        b.err = fmt.Errorf("get tombstones: %w", err)
                        return false
                }

                // NOTE:
                // * block time range is half-open: [meta.MinTime, meta.MaxTime).
                // * chunks are both closed: [chk.MinTime, chk.MaxTime].
                // * requested time ranges are closed: [req.Start, req.End].

                // Copy chunks as iterables are reusable.
                // Count those in range to size allocation (roughly - ignoring tombstones).
                inRange := 0
                for _, chk := range b.bufChks {
                        if chk.MaxTime >= b.mint && chk.MinTime <= b.maxt {
                                inRange++
                        }
                }
                kept := make([]chunks.Meta, 0, inRange)

                // Prefilter chunks and pick those which are not entirely deleted or totally outside of the requested range.
                trimFront, trimBack := false, false
                for _, chk := range b.bufChks {
                        if chk.MaxTime < b.mint || chk.MinTime > b.maxt {
                                continue
                        }
                        span := tombstones.Interval{Mint: chk.MinTime, Maxt: chk.MaxTime}
                        if span.IsSubrange(intervals) {
                                continue
                        }
                        kept = append(kept, chk)

                        // If still not entirely deleted, check if trim is needed based on requested time range.
                        if b.disableTrimming {
                                continue
                        }
                        trimFront = trimFront || chk.MinTime < b.mint
                        trimBack = trimBack || chk.MaxTime > b.maxt
                }

                if len(kept) == 0 {
                        continue
                }

                // Trimming is expressed as extra deletion intervals outside the range.
                if trimFront {
                        intervals = intervals.Add(tombstones.Interval{Mint: math.MinInt64, Maxt: b.mint - 1})
                }
                if trimBack {
                        intervals = intervals.Add(tombstones.Interval{Mint: b.maxt + 1, Maxt: math.MaxInt64})
                }

                b.curr.labels = b.builder.Labels()
                b.curr.chks = kept
                b.curr.intervals = intervals
                return true
        }
        return false
}

// Err returns the error recorded by Next, or, if there is none, the error of
// the underlying postings iterator.
func (b *blockBaseSeriesSet) Err() error {
        if b.err == nil {
                return b.p.Err()
        }
        return b.err
}

// Warnings always returns nil; this series set produces no annotations.
func (b *blockBaseSeriesSet) Warnings() annotations.Annotations {
        return nil
}

// populateWithDelGenericSeriesIterator allows to iterate over given chunk
// metas. In each iteration it ensures that chunks are trimmed based on given
// tombstones interval if any.
//
// populateWithDelGenericSeriesIterator assumes that chunks that would be fully
// removed by intervals are filtered out in previous phase.
//
// On each iteration currMeta is available. If currDelIter is not nil, it
// means that the chunk in currMeta is invalid and a chunk rewrite is needed,
// for which currDelIter should be used.
type populateWithDelGenericSeriesIterator struct {
        // blockID is only used to annotate errors returned while populating chunks.
        blockID ulid.ULID
        cr      ChunkReader
        // metas are expected to be sorted by minTime and should be related to
        // the same, single series.
        // It's possible for a single chunks.Meta to refer to multiple chunks.
        // cr.ChunkOrIterable() would return an iterable and a nil chunk in this
        // case.
        metas []chunks.Meta

        i         int // Index into metas; -1 if not started yet.
        err       error
        bufIter   DeletedIterator // Retained for memory re-use. currDelIter may point here.
        intervals tombstones.Intervals

        currDelIter chunkenc.Iterator
        // currMeta is the current chunks.Meta from metas. currMeta.Chunk is set to
        // the chunk returned from cr.ChunkOrIterable(). As that can return a nil
        // chunk, currMeta.Chunk is not always guaranteed to be set.
        currMeta chunks.Meta
}

// reset points the iterator at a new series: it installs the block ID, chunk
// reader, chunk metas and deletion intervals, and rewinds all per-series state
// so that the next call to next() starts at the first meta.
func (p *populateWithDelGenericSeriesIterator) reset(blockID ulid.ULID, cr ChunkReader, chks []chunks.Meta, intervals tombstones.Intervals) {
        // New inputs.
        p.blockID, p.cr = blockID, cr
        p.metas, p.intervals = chks, intervals

        // Rewound iteration state.
        p.i = -1
        p.err = nil
        p.currDelIter = nil
        p.currMeta = chunks.Meta{}

        // Note we don't touch p.bufIter.Iter; it is holding on to an iterator we might reuse in next().
        p.bufIter.Intervals = p.bufIter.Intervals[:0]
}

// next advances to the following chunk meta and loads its chunk (or iterable).
// It returns false once all metas are consumed or an error has been recorded.
//
// After a successful call, currDelIter is nil when currMeta.Chunk can be used
// as it is; otherwise currDelIter iterates the samples that remain after
// applying the deletion intervals overlapping the chunk.
//
// If copyHeadChunk is true, then the head chunk (i.e. the in-memory chunk of the TSDB)
// is deep copied to avoid races between reads and copying chunk bytes.
// However, if the deletion intervals overlaps with the head chunk, then the head chunk is
// not copied irrespective of copyHeadChunk because it will be re-encoded later anyway.
func (p *populateWithDelGenericSeriesIterator) next(copyHeadChunk bool) bool {
        if p.err != nil {
                return false
        }
        if p.i+1 >= len(p.metas) {
                return false
        }

        p.i++
        p.currMeta = p.metas[p.i]

        // Collect the deletion intervals that touch the current chunk, reusing
        // the buffer held by bufIter.
        overlapping := p.bufIter.Intervals[:0]
        for _, iv := range p.intervals {
                if p.currMeta.OverlapsClosedInterval(iv.Mint, iv.Maxt) {
                        overlapping = overlapping.Add(iv)
                }
        }
        p.bufIter.Intervals = overlapping

        var iterable chunkenc.Iterable
        if hcr, isHead := p.cr.(*headChunkReader); isHead && copyHeadChunk && len(p.bufIter.Intervals) == 0 {
                // ChunkWithCopy will copy the head chunk.
                var maxt int64
                p.currMeta.Chunk, maxt, p.err = hcr.ChunkWithCopy(p.currMeta)
                // For the in-memory head chunk the index reader sets maxt as MaxInt64. We fix it here.
                p.currMeta.MaxTime = maxt
        } else {
                p.currMeta.Chunk, iterable, p.err = p.cr.ChunkOrIterable(p.currMeta)
        }

        if p.err != nil {
                p.err = fmt.Errorf("cannot populate chunk %d from block %s: %w", p.currMeta.Ref, p.blockID.String(), p.err)
                return false
        }

        hasSingleChunk := p.currMeta.Chunk != nil
        if hasSingleChunk && len(p.bufIter.Intervals) == 0 {
                // If there is no overlap with deletion intervals and a single chunk is
                // returned, we can take chunk as it is.
                p.currDelIter = nil
                return true
        }

        if hasSingleChunk {
                // We need to iterate over the samples in the single chunk
                // and create new chunks.
                p.bufIter.Iter = p.currMeta.Chunk.Iterator(p.bufIter.Iter)
        } else {
                // No single chunk: use the iterable to create an iterator.
                p.bufIter.Iter = iterable.Iterator(p.bufIter.Iter)
        }
        p.currDelIter = &p.bufIter
        return true
}

// Err returns the error recorded while populating chunks, if any.
func (p *populateWithDelGenericSeriesIterator) Err() error {
        return p.err
}

// blockSeriesEntry combines the seriesData of one series with the chunk reader
// and block ID that its Iterator method passes on to a sample iterator.
type blockSeriesEntry struct {
        chunks  ChunkReader
        blockID ulid.ULID
        seriesData
}

// Iterator returns a sample iterator over this series. If it is already a
// *populateWithDelSeriesIterator it is reset and reused; otherwise a new one
// is allocated.
func (s *blockSeriesEntry) Iterator(it chunkenc.Iterator) chunkenc.Iterator {
        reusable, ok := it.(*populateWithDelSeriesIterator)
        if !ok {
                reusable = new(populateWithDelSeriesIterator)
        }

        reusable.reset(s.blockID, s.chunks, s.chks, s.intervals)
        return reusable
}

// chunkSeriesEntry combines the seriesData of one series with the chunk reader
// and block ID that its Iterator method passes on to a chunk iterator.
type chunkSeriesEntry struct {
        chunks  ChunkReader
        blockID ulid.ULID
        seriesData
}

// Iterator returns a chunk iterator over this series. If it is already a
// *populateWithDelChunkSeriesIterator it is reset and reused; otherwise a new
// one is allocated.
func (s *chunkSeriesEntry) Iterator(it chunks.Iterator) chunks.Iterator {
        reusable, ok := it.(*populateWithDelChunkSeriesIterator)
        if !ok {
                reusable = new(populateWithDelChunkSeriesIterator)
        }

        reusable.reset(s.blockID, s.chunks, s.chks, s.intervals)
        return reusable
}

// populateWithDelSeriesIterator allows to iterate over samples for the single series.
type populateWithDelSeriesIterator struct {
        populateWithDelGenericSeriesIterator

        // curr is the sample iterator of the chunk currently being read; nil
        // until the first chunk is opened and again after reset.
        curr chunkenc.Iterator
}

// reset prepares the iterator for a new series by dropping the current sample
// iterator and resetting the embedded generic iterator with the given inputs.
func (p *populateWithDelSeriesIterator) reset(blockID ulid.ULID, cr ChunkReader, chks []chunks.Meta, intervals tombstones.Intervals) {
        p.curr = nil
        p.populateWithDelGenericSeriesIterator.reset(blockID, cr, chks, intervals)
}

// Next advances to the next sample of the series and returns its value type.
// When the current chunk is exhausted it moves on to the following chunks
// until one yields a sample. chunkenc.ValNone is returned when no sample is
// left or chunk population failed.
func (p *populateWithDelSeriesIterator) Next() chunkenc.ValueType {
        // First try the chunk we are already positioned in.
        if p.curr != nil {
                if vt := p.curr.Next(); vt != chunkenc.ValNone {
                        return vt
                }
        }

        for p.next(false) {
                switch {
                case p.currDelIter != nil:
                        // Samples must be filtered through the deletion iterator.
                        p.curr = p.currDelIter
                default:
                        // The chunk is usable as is; reuse the previous iterator if possible.
                        p.curr = p.currMeta.Chunk.Iterator(p.curr)
                }

                vt := p.curr.Next()
                if vt != chunkenc.ValNone {
                        return vt
                }
        }
        return chunkenc.ValNone
}

// Seek asks the current chunk iterator to seek to t. If that yields nothing,
// it keeps advancing with Next and seeking again until a seek succeeds or the
// series is exhausted, in which case chunkenc.ValNone is returned.
func (p *populateWithDelSeriesIterator) Seek(t int64) chunkenc.ValueType {
        if p.curr != nil {
                if vt := p.curr.Seek(t); vt != chunkenc.ValNone {
                        return vt
                }
        }

        for {
                if p.Next() == chunkenc.ValNone {
                        return chunkenc.ValNone
                }
                if vt := p.curr.Seek(t); vt != chunkenc.ValNone {
                        return vt
                }
        }
}

// At returns the timestamp and float value reported by the current chunk iterator.
func (p *populateWithDelSeriesIterator) At() (int64, float64) {
        ts, val := p.curr.At()
        return ts, val
}

// AtHistogram forwards h to the current chunk iterator and returns the
// timestamp and histogram it reports.
func (p *populateWithDelSeriesIterator) AtHistogram(h *histogram.Histogram) (int64, *histogram.Histogram) {
        ts, hist := p.curr.AtHistogram(h)
        return ts, hist
}

// AtFloatHistogram forwards fh to the current chunk iterator and returns the
// timestamp and float histogram it reports.
func (p *populateWithDelSeriesIterator) AtFloatHistogram(fh *histogram.FloatHistogram) (int64, *histogram.FloatHistogram) {
        ts, hist := p.curr.AtFloatHistogram(fh)
        return ts, hist
}

// AtT returns the timestamp reported by the current chunk iterator.
func (p *populateWithDelSeriesIterator) AtT() int64 {
        ts := p.curr.AtT()
        return ts
}

// Err returns the chunk population error of the embedded generic iterator if
// there is one; otherwise the error of the current chunk iterator, or nil when
// no chunk has been opened yet.
func (p *populateWithDelSeriesIterator) Err() error {
        populateErr := p.populateWithDelGenericSeriesIterator.Err()
        switch {
        case populateErr != nil:
                return populateErr
        case p.curr == nil:
                return nil
        default:
                return p.curr.Err()
        }
}

// populateWithDelChunkSeriesIterator iterates over the chunks of a single
// series. Each successful Next leaves the current chunk in currMetaWithChunk.
type populateWithDelChunkSeriesIterator struct {
        populateWithDelGenericSeriesIterator

        // currMetaWithChunk is current meta with its chunk field set. This meta
        // is guaranteed to map to a single chunk. This differs from
        // populateWithDelGenericSeriesIterator.currMeta as that
        // could refer to multiple chunks.
        currMetaWithChunk chunks.Meta

        // chunksFromIterable stores the chunks created from iterating through
        // the iterable returned by cr.ChunkOrIterable() (with deleted samples
        // removed).
        chunksFromIterable    []chunks.Meta
        // chunksFromIterableIdx is the index into chunksFromIterable of the chunk
        // last handed out by Next; reset sets it to -1.
        chunksFromIterableIdx int
}

// reset prepares the iterator for a new series: the embedded generic iterator
// is reset with the given inputs, the current chunk meta is cleared and the
// buffer of chunks built from iterables is emptied (keeping its capacity).
func (p *populateWithDelChunkSeriesIterator) reset(blockID ulid.ULID, cr ChunkReader, chks []chunks.Meta, intervals tombstones.Intervals) {
        p.populateWithDelGenericSeriesIterator.reset(blockID, cr, chks, intervals)

        p.chunksFromIterableIdx = -1
        p.chunksFromIterable = p.chunksFromIterable[:0]
        p.currMetaWithChunk = chunks.Meta{}
}

// Next advances to the next chunk of the series and stores it in
// p.currMetaWithChunk. It returns false when there are no more chunks or the
// underlying iteration stopped.
func (p *populateWithDelChunkSeriesIterator) Next() bool {
        // If we've been creating chunks from the iterable, check if there are
        // any more chunks to iterate through.
        if upcoming := p.chunksFromIterableIdx + 1; p.currMeta.Chunk == nil && upcoming < len(p.chunksFromIterable) {
                p.chunksFromIterableIdx = upcoming
                p.currMetaWithChunk = p.chunksFromIterable[upcoming]
                return true
        }

        // Move to the next chunk/deletion iterator.
        // This is a for loop as if the current p.currDelIter returns no samples
        // (which means a chunk won't be created), there still might be more
        // samples/chunks from the rest of p.metas.
        for p.next(true) {
                switch {
                case p.currDelIter == nil:
                        // The chunk can be handed out unchanged.
                        p.currMetaWithChunk = p.currMeta
                        return true
                case p.currMeta.Chunk != nil:
                        // If ChunkOrIterable() returned a non-nil chunk, the samples in
                        // p.currDelIter will only form one chunk, as the only change
                        // p.currDelIter might make is deleting some samples.
                        if p.populateCurrForSingleChunk() {
                                return true
                        }
                default:
                        // If ChunkOrIterable() returned an iterable, multiple chunks may be
                        // created from the samples in p.currDelIter.
                        if p.populateChunksFromIterable() {
                                return true
                        }
                }
        }
        return false
}

// populateCurrForSingleChunk sets the fields within p.currMetaWithChunk. This
// should be called if the samples in p.currDelIter only form one chunk.
//
// All samples remaining in p.currDelIter are re-encoded into a fresh chunk
// whose encoding is chosen by the type of the first sample. It returns false
// if p.currDelIter yields no sample or if re-encoding fails; in the failure
// case p.err is set. On success MinTime, MaxTime and Chunk of
// p.currMetaWithChunk are populated.
func (p *populateWithDelChunkSeriesIterator) populateCurrForSingleChunk() bool {
        valueType := p.currDelIter.Next()
        if valueType == chunkenc.ValNone {
                if err := p.currDelIter.Err(); err != nil {
                        p.err = fmt.Errorf("iterate chunk while re-encoding: %w", err)
                }
                return false
        }
        p.currMetaWithChunk.MinTime = p.currDelIter.AtT()

        // Re-encode the chunk if iterator is provided. This means that it has
        // some samples to be deleted or chunk is opened.
        var (
                newChunk chunkenc.Chunk
                app      chunkenc.Appender
                t        int64
                err      error
        )
        // Note: a `break` inside the inner for loops only leaves that loop; since
        // the loop is the last statement of its case, control then falls to the
        // shared error handling below.
        switch valueType {
        case chunkenc.ValHistogram:
                newChunk = chunkenc.NewHistogramChunk()
                if app, err = newChunk.Appender(); err != nil {
                        break
                }
                for vt := valueType; vt != chunkenc.ValNone; vt = p.currDelIter.Next() {
                        if vt != chunkenc.ValHistogram {
                                err = fmt.Errorf("found value type %v in histogram chunk", vt)
                                break
                        }
                        var h *histogram.Histogram
                        t, h = p.currDelIter.AtHistogram(nil)
                        _, _, app, err = app.AppendHistogram(nil, t, h, true)
                        if err != nil {
                                break
                        }
                }
        case chunkenc.ValFloat:
                newChunk = chunkenc.NewXORChunk()
                if app, err = newChunk.Appender(); err != nil {
                        break
                }
                for vt := valueType; vt != chunkenc.ValNone; vt = p.currDelIter.Next() {
                        if vt != chunkenc.ValFloat {
                                err = fmt.Errorf("found value type %v in float chunk", vt)
                                break
                        }
                        var v float64
                        t, v = p.currDelIter.At()
                        app.Append(t, v)
                }
        case chunkenc.ValFloatHistogram:
                newChunk = chunkenc.NewFloatHistogramChunk()
                if app, err = newChunk.Appender(); err != nil {
                        break
                }
                for vt := valueType; vt != chunkenc.ValNone; vt = p.currDelIter.Next() {
                        if vt != chunkenc.ValFloatHistogram {
                                // Name the float histogram chunk explicitly so this failure
                                // is distinguishable from the ValHistogram branch above.
                                err = fmt.Errorf("found value type %v in float histogram chunk", vt)
                                break
                        }
                        var h *histogram.FloatHistogram
                        t, h = p.currDelIter.AtFloatHistogram(nil)
                        _, _, app, err = app.AppendFloatHistogram(nil, t, h, true)
                        if err != nil {
                                break
                        }
                }
        default:
                err = fmt.Errorf("populateCurrForSingleChunk: value type %v unsupported", valueType)
        }

        if err != nil {
                p.err = fmt.Errorf("iterate chunk while re-encoding: %w", err)
                return false
        }
        if err := p.currDelIter.Err(); err != nil {
                p.err = fmt.Errorf("iterate chunk while re-encoding: %w", err)
                return false
        }

        p.currMetaWithChunk.Chunk = newChunk
        // t holds the timestamp of the last sample that was appended.
        p.currMetaWithChunk.MaxTime = t
        return true
}

// populateChunksFromIterable reads the samples from currDelIter to create
// chunks for chunksFromIterable. It also sets p.currMetaWithChunk to the first
// chunk.
//
// A new chunk is started whenever the sample value type changes, or when a
// histogram appender hands back a new chunk. It returns false if no chunk
// could be produced; if that was caused by an error, p.err is set.
func (p *populateWithDelChunkSeriesIterator) populateChunksFromIterable() bool {
        p.chunksFromIterable = p.chunksFromIterable[:0]
        p.chunksFromIterableIdx = -1

        firstValueType := p.currDelIter.Next()
        if firstValueType == chunkenc.ValNone {
                if err := p.currDelIter.Err(); err != nil {
                        p.err = fmt.Errorf("populateChunksFromIterable: no samples could be read: %w", err)
                        return false
                }
                return false
        }

        var (
                // t is the timestamp for the current sample.
                t     int64
                cmint int64
                cmaxt int64

                currentChunk chunkenc.Chunk

                app chunkenc.Appender

                err error
        )

        prevValueType := chunkenc.ValNone

        for currentValueType := firstValueType; currentValueType != chunkenc.ValNone; currentValueType = p.currDelIter.Next() {
                // newChunk and recoded describe the outcome of appending THIS sample
                // only. They are scoped to the iteration so that a value left over
                // from an earlier histogram append cannot be mistaken for a chunk cut
                // on a later float sample (the float branch never assigns them).
                var (
                        newChunk chunkenc.Chunk
                        recoded  bool
                )

                // Check if the encoding has changed (i.e. we need to create a new
                // chunk as chunks can't have multiple encoding types).
                // For the first sample, the following condition will always be true as
                // ValNone != ValFloat | ValHistogram | ValFloatHistogram.
                if currentValueType != prevValueType {
                        if prevValueType != chunkenc.ValNone {
                                p.chunksFromIterable = append(p.chunksFromIterable, chunks.Meta{Chunk: currentChunk, MinTime: cmint, MaxTime: cmaxt})
                        }
                        cmint = p.currDelIter.AtT()
                        if currentChunk, err = currentValueType.NewChunk(); err != nil {
                                break
                        }
                        if app, err = currentChunk.Appender(); err != nil {
                                break
                        }
                }

                switch currentValueType {
                case chunkenc.ValFloat:
                        {
                                var v float64
                                t, v = p.currDelIter.At()
                                app.Append(t, v)
                        }
                case chunkenc.ValHistogram:
                        {
                                var v *histogram.Histogram
                                t, v = p.currDelIter.AtHistogram(nil)
                                // No need to set prevApp as AppendHistogram will set the
                                // counter reset header for the appender that's returned.
                                newChunk, recoded, app, err = app.AppendHistogram(nil, t, v, false)
                        }
                case chunkenc.ValFloatHistogram:
                        {
                                var v *histogram.FloatHistogram
                                t, v = p.currDelIter.AtFloatHistogram(nil)
                                // No need to set prevApp as AppendFloatHistogram will set the
                                // counter reset header for the appender that's returned.
                                newChunk, recoded, app, err = app.AppendFloatHistogram(nil, t, v, false)
                        }
                }

                if err != nil {
                        break
                }

                if newChunk != nil {
                        // The appender produced a different chunk. Unless it is a recoded
                        // replacement of the current one, the current chunk is complete
                        // and gets emitted with the time range accumulated so far.
                        if !recoded {
                                p.chunksFromIterable = append(p.chunksFromIterable, chunks.Meta{Chunk: currentChunk, MinTime: cmint, MaxTime: cmaxt})
                        }
                        currentChunk = newChunk
                        cmint = t
                }

                cmaxt = t
                prevValueType = currentValueType
        }

        if err != nil {
                p.err = fmt.Errorf("populateChunksFromIterable: error when writing new chunks: %w", err)
                return false
        }
        if err = p.currDelIter.Err(); err != nil {
                p.err = fmt.Errorf("populateChunksFromIterable: currDelIter error when writing new chunks: %w", err)
                return false
        }

        // Emit the chunk that was still being filled when the samples ran out.
        if prevValueType != chunkenc.ValNone {
                p.chunksFromIterable = append(p.chunksFromIterable, chunks.Meta{Chunk: currentChunk, MinTime: cmint, MaxTime: cmaxt})
        }

        if len(p.chunksFromIterable) == 0 {
                return false
        }

        p.currMetaWithChunk = p.chunksFromIterable[0]
        p.chunksFromIterableIdx = 0
        return true
}

// At returns the chunk meta that the most recent successful call to Next
// stored in p.currMetaWithChunk.
func (p *populateWithDelChunkSeriesIterator) At() chunks.Meta {
        return p.currMetaWithChunk
}

// blockSeriesSet allows iterating over sorted, populated series with tombstones applied.
// Series with all deleted chunks are still present as Series with no samples.
// Samples from chunks are also trimmed to requested min and max time.
//
// All iteration state lives in the embedded blockBaseSeriesSet; this type only
// adds an At method that exposes the current entry as a storage.Series.
type blockSeriesSet struct {
        blockBaseSeriesSet
}

// newBlockSeriesSet builds a blockSeriesSet over the given index, chunk and
// tombstone readers, restricted to postings p and the [mint, maxt] range.
// The blockID of the embedded base set is not set here and stays at its zero
// value.
func newBlockSeriesSet(i IndexReader, c ChunkReader, t tombstones.Reader, p index.Postings, mint, maxt int64, disableTrimming bool) storage.SeriesSet {
        set := &blockSeriesSet{}
        set.index = i
        set.chunks = c
        set.tombstones = t
        set.p = p
        set.mint = mint
        set.maxt = maxt
        set.disableTrimming = disableTrimming
        return set
}

// At returns the current series as a storage.Series.
func (b *blockSeriesSet) At() storage.Series {
        // The returned entry may be consumed after the set has advanced, so the
        // current values are copied into a fresh entry rather than shared.
        entry := new(blockSeriesEntry)
        entry.chunks = b.chunks
        entry.blockID = b.blockID
        entry.seriesData = b.curr
        return entry
}

// blockChunkSeriesSet allows iterating over sorted, populated series with tombstones applied.
// Series with all deleted chunks are still present as Labelled iterator with no chunks.
// Chunks are also trimmed to requested [min and max] (keeping samples with min and max timestamps).
//
// All iteration state lives in the embedded blockBaseSeriesSet; this type only
// adds an At method that exposes the current entry as a storage.ChunkSeries.
type blockChunkSeriesSet struct {
        blockBaseSeriesSet
}

// NewBlockChunkSeriesSet returns a storage.ChunkSeriesSet for the block
// identified by id, built over the given index, chunk and tombstone readers
// and restricted to postings p and the [mint, maxt] range.
func NewBlockChunkSeriesSet(id ulid.ULID, i IndexReader, c ChunkReader, t tombstones.Reader, p index.Postings, mint, maxt int64, disableTrimming bool) storage.ChunkSeriesSet {
        set := &blockChunkSeriesSet{}
        set.blockID = id
        set.index = i
        set.chunks = c
        set.tombstones = t
        set.p = p
        set.mint = mint
        set.maxt = maxt
        set.disableTrimming = disableTrimming
        return set
}

// At returns the current series as a storage.ChunkSeries.
func (b *blockChunkSeriesSet) At() storage.ChunkSeries {
        // The returned entry may be consumed after the set has advanced, so the
        // current values are copied into a fresh entry rather than shared.
        entry := new(chunkSeriesEntry)
        entry.chunks = b.chunks
        entry.blockID = b.blockID
        entry.seriesData = b.curr
        return entry
}

// NewMergedStringIter returns a string iterator that merges the values of a
// and b on demand and streams the result. Both inputs are advanced once here
// (a first, then b) so that their first values are ready for comparison.
func NewMergedStringIter(a, b index.StringIter) index.StringIter {
        merged := &mergedStringIter{a: a, b: b}
        merged.aok = a.Next()
        merged.bok = b.Next()
        return merged
}

// mergedStringIter merges two index.StringIter inputs into one stream,
// emitting a value present in both inputs only once.
type mergedStringIter struct {
        // a and b are the two inputs being merged.
        a        index.StringIter
        b        index.StringIter
        // aok and bok hold the result of the last Next call on a and b, i.e.
        // whether each input currently has a value that was not emitted yet.
        aok, bok bool
        // cur is the value most recently selected by Next.
        cur      string
        // err is the last error picked up from either input.
        err      error
}

// Next selects the smaller of the two pending input values as the current
// value and advances the input(s) it came from. When both inputs hold the same
// value, it is emitted once and both inputs are advanced. It returns false
// when both inputs are exhausted or an error has been recorded.
func (m *mergedStringIter) Next() bool {
        if !m.aok && !m.bok {
                return false
        }
        if m.Err() != nil {
                return false
        }

        // Each helper advances one input and records that input's error state.
        stepA := func() {
                m.aok = m.a.Next()
                m.err = m.a.Err()
        }
        stepB := func() {
                m.bok = m.b.Next()
                m.err = m.b.Err()
        }

        switch {
        case !m.aok:
                // Only b has values left.
                m.cur = m.b.At()
                stepB()
        case !m.bok:
                // Only a has values left.
                m.cur = m.a.At()
                stepA()
        case m.b.At() > m.a.At():
                m.cur = m.a.At()
                stepA()
        case m.a.At() > m.b.At():
                m.cur = m.b.At()
                stepB()
        default: // Equal.
                m.cur = m.b.At()
                stepA()
                m.bok = m.b.Next()
                // An error from a takes precedence; b's error is only consulted
                // when a reported none.
                if m.err == nil {
                        m.err = m.b.Err()
                }
        }

        return true
}
// At returns the value selected by the most recent call to Next.
func (m mergedStringIter) At() string {
        return m.cur
}
// Err returns the error recorded while advancing the inputs, if any.
func (m mergedStringIter) Err() error { return m.err }

// DeletedIterator wraps chunk Iterator and makes sure any deleted metrics are not returned.
// Concretely, Next and Seek skip samples whose timestamp lies within one of
// Intervals. Both methods drop leading intervals from Intervals once the
// iteration has moved past them, so the field is mutated during iteration.
type DeletedIterator struct {
        // Iter is an Iterator to be wrapped.
        Iter chunkenc.Iterator
        // Intervals are the deletion intervals.
        Intervals tombstones.Intervals
}

// At returns the current timestamp/value pair of the wrapped iterator.
func (it *DeletedIterator) At() (int64, float64) {
        ts, v := it.Iter.At()
        return ts, v
}

// AtHistogram returns the current timestamp/histogram pair of the wrapped
// iterator, forwarding h to it unchanged.
func (it *DeletedIterator) AtHistogram(h *histogram.Histogram) (int64, *histogram.Histogram) {
        return it.Iter.AtHistogram(h)
}

// AtFloatHistogram returns the current timestamp/float histogram pair of the
// wrapped iterator, forwarding fh to it unchanged.
func (it *DeletedIterator) AtFloatHistogram(fh *histogram.FloatHistogram) (int64, *histogram.FloatHistogram) {
        return it.Iter.AtFloatHistogram(fh)
}

// AtT returns the current timestamp of the wrapped iterator.
func (it *DeletedIterator) AtT() int64 {
        ts := it.Iter.AtT()
        return ts
}

// Seek forwards the seek to the wrapped iterator and then makes sure the
// sample it landed on is not inside a deletion interval. If it is, the
// iterator is advanced with Next. It returns chunkenc.ValNone if the wrapped
// iterator already carries an error or the seek finds nothing.
func (it *DeletedIterator) Seek(t int64) chunkenc.ValueType {
        if it.Iter.Err() != nil {
                return chunkenc.ValNone
        }
        valueType := it.Iter.Seek(t)
        if valueType == chunkenc.ValNone {
                return chunkenc.ValNone
        }

        // Now double check if the entry falls into a deleted interval.
        // The range expression is evaluated once, so re-slicing it.Intervals
        // inside the loop does not disturb the iteration itself.
        ts := it.AtT()
        for _, itv := range it.Intervals {
                switch {
                case ts < itv.Mint:
                        // The sample lies before this interval, so it is kept.
                        return valueType
                case ts > itv.Maxt:
                        // The interval lies entirely behind the sample; drop it.
                        it.Intervals = it.Intervals[1:]
                default:
                        // We're in the middle of an interval, we can now call Next().
                        return it.Next()
                }
        }

        // The timestamp is greater than all the deleted intervals.
        return valueType
}

// Next advances the wrapped iterator until it reaches a sample that is not
// covered by a deletion interval and returns that sample's value type. It
// returns chunkenc.ValNone when the wrapped iterator is exhausted.
func (it *DeletedIterator) Next() chunkenc.ValueType {
        for {
                valueType := it.Iter.Next()
                if valueType == chunkenc.ValNone {
                        return chunkenc.ValNone
                }

                ts := it.AtT()
                deleted := false
                // The range expression is evaluated once, so re-slicing it.Intervals
                // inside the loop does not disturb the iteration itself.
                for _, tr := range it.Intervals {
                        if tr.InBounds(ts) {
                                // Sample is deleted; move on to the next sample.
                                deleted = true
                                break
                        }

                        if ts <= tr.Maxt {
                                return valueType
                        }
                        // The interval ends before this sample; drop it.
                        it.Intervals = it.Intervals[1:]
                }
                if !deleted {
                        return valueType
                }
        }
}

// Err returns the error of the wrapped iterator.
func (it *DeletedIterator) Err() error {
        return it.Iter.Err()
}

// nopChunkReader is a ChunkReader that answers every lookup with the same
// pre-allocated chunk and never returns an error.
type nopChunkReader struct {
        // emptyChunk is the chunk returned by every ChunkOrIterable call.
        emptyChunk chunkenc.Chunk
}

// newNopChunkReader returns a ChunkReader backed by a single new XOR chunk.
func newNopChunkReader() ChunkReader {
        var cr nopChunkReader
        cr.emptyChunk = chunkenc.NewXORChunk()
        return cr
}

// ChunkOrIterable ignores the requested meta and always returns the reader's
// chunk, no iterable and no error.
func (cr nopChunkReader) ChunkOrIterable(chunks.Meta) (chunkenc.Chunk, chunkenc.Iterable, error) {
        var noIterable chunkenc.Iterable
        return cr.emptyChunk, noIterable, nil
}

// Close does nothing and always returns nil.
func (cr nopChunkReader) Close() error {
        return nil
}

// Copyright 2018 The Prometheus Authors

// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// Package record contains the various record types used for encoding various Head block data in the WAL and in-memory snapshot.
package record

import (
        "errors"
        "fmt"
        "math"

        "github.com/prometheus/common/model"

        "github.com/prometheus/prometheus/model/histogram"
        "github.com/prometheus/prometheus/model/labels"
        "github.com/prometheus/prometheus/storage"
        "github.com/prometheus/prometheus/tsdb/chunks"
        "github.com/prometheus/prometheus/tsdb/encoding"
        "github.com/prometheus/prometheus/tsdb/tombstones"
)

// Type represents the data type of a record.
type Type uint8

// Record type identifiers, listed in ascending numeric order.
const (
        // Series is used to match WAL records of type Series.
        Series Type = 1
        // Samples is used to match WAL records of type Samples.
        Samples Type = 2
        // Tombstones is used to match WAL records of type Tombstones.
        Tombstones Type = 3
        // Exemplars is used to match WAL records of type Exemplars.
        Exemplars Type = 4
        // MmapMarkers is used to match OOO WBL records of type MmapMarkers.
        MmapMarkers Type = 5
        // Metadata is used to match WAL records of type Metadata.
        Metadata Type = 6
        // HistogramSamples is used to match WAL records of type Histograms.
        HistogramSamples Type = 7
        // FloatHistogramSamples is used to match WAL records of type Float Histograms.
        FloatHistogramSamples Type = 8
        // Unknown is returned for unrecognised WAL record types.
        Unknown Type = 255
)

// String returns the lower-case name of the record type, or "unknown" for any
// value that is not one of the named record types.
func (rt Type) String() string {
        name := "unknown"
        switch rt {
        case Series:
                name = "series"
        case Samples:
                name = "samples"
        case Tombstones:
                name = "tombstones"
        case Exemplars:
                name = "exemplars"
        case MmapMarkers:
                name = "mmapmarkers"
        case Metadata:
                name = "metadata"
        case HistogramSamples:
                name = "histogram_samples"
        case FloatHistogramSamples:
                name = "float_histogram_samples"
        }
        return name
}

// MetricType represents the type of a series.
type MetricType uint8

// Numeric identifiers for the metric types. GetMetricType and ToMetricType
// translate between these values and the corresponding model.MetricType.
const (
        // UnknownMT is the value GetMetricType returns for any model type it
        // does not recognise.
        UnknownMT       MetricType = 0
        Counter         MetricType = 1
        Gauge           MetricType = 2
        HistogramSample MetricType = 3
        GaugeHistogram  MetricType = 4
        Summary         MetricType = 5
        Info            MetricType = 6
        Stateset        MetricType = 7
)

// GetMetricType maps a model.MetricType to the uint8 value of the matching
// MetricType constant. Unrecognised types map to UnknownMT.
func GetMetricType(t model.MetricType) uint8 {
        mt := UnknownMT
        switch t {
        case model.MetricTypeCounter:
                mt = Counter
        case model.MetricTypeGauge:
                mt = Gauge
        case model.MetricTypeHistogram:
                mt = HistogramSample
        case model.MetricTypeGaugeHistogram:
                mt = GaugeHistogram
        case model.MetricTypeSummary:
                mt = Summary
        case model.MetricTypeInfo:
                mt = Info
        case model.MetricTypeStateset:
                mt = Stateset
        }
        return uint8(mt)
}

// ToMetricType is the inverse of GetMetricType: it maps the uint8 value of a
// MetricType constant back to the model.MetricType. Any other value maps to
// model.MetricTypeUnknown.
func ToMetricType(m uint8) model.MetricType {
        switch MetricType(m) {
        case Counter:
                return model.MetricTypeCounter
        case Gauge:
                return model.MetricTypeGauge
        case HistogramSample:
                return model.MetricTypeHistogram
        case GaugeHistogram:
                return model.MetricTypeGaugeHistogram
        case Summary:
                return model.MetricTypeSummary
        case Info:
                return model.MetricTypeInfo
        case Stateset:
                return model.MetricTypeStateset
        }
        return model.MetricTypeUnknown
}

// Field names that Decoder.Metadata recognises among the name/value pairs of
// a metadata entry; fields with any other name are decoded and discarded.
const (
        unitMetaName = "UNIT"
        helpMetaName = "HELP"
)

// ErrNotFound is returned if a looked-up resource was not found.
// It duplicates ErrNotFound from head.go.
var ErrNotFound = errors.New("not found")

// RefSeries is the series labels with the series ID.
// Decoder.Series produces one RefSeries per entry of a Series record.
type RefSeries struct {
        Ref    chunks.HeadSeriesRef
        Labels labels.Labels
}

// RefSample is a timestamp/value pair associated with a reference to a series.
// Decoder.Samples produces one RefSample per entry of a Samples record.
// TODO(beorn7): Perhaps make this "polymorphic", including histogram and float-histogram pointers? Then get rid of RefHistogramSample.
type RefSample struct {
        Ref chunks.HeadSeriesRef
        T   int64
        V   float64
}

// RefMetadata is the metadata associated with a series ID.
// Type is kept as the raw byte read from the record; presumably it is one of
// the MetricType values produced by GetMetricType (confirm against the
// encoder). Unit and Help are empty when the record carries no such field.
type RefMetadata struct {
        Ref  chunks.HeadSeriesRef
        Type uint8
        Unit string
        Help string
}

// RefExemplar is an exemplar with the labels, timestamp and value the exemplar
// was collected/observed with, and a reference to a series.
type RefExemplar struct {
        Ref    chunks.HeadSeriesRef
        T      int64
        V      float64
        Labels labels.Labels
}

// RefHistogramSample is a timestamp/histogram pair associated with a
// reference to a series.
type RefHistogramSample struct {
        Ref chunks.HeadSeriesRef
        T   int64
        H   *histogram.Histogram
}

// RefFloatHistogramSample is a timestamp/float histogram pair associated with
// a reference to a series.
type RefFloatHistogramSample struct {
        Ref chunks.HeadSeriesRef
        T   int64
        FH  *histogram.FloatHistogram
}

// RefMmapMarker marks that all the samples of the given series until now have been m-mapped to disk.
type RefMmapMarker struct {
        Ref     chunks.HeadSeriesRef
        MmapRef chunks.ChunkDiskMapperRef
}

// Decoder decodes series, sample, metadata, tombstone, exemplar, mmap marker
// and (float) histogram sample records.
type Decoder struct {
        // builder is reset and reused by DecodeLabels for every label set.
        builder labels.ScratchBuilder
}

// NewDecoder returns a Decoder with a fresh label scratch builder.
// The symbol table argument is currently unused.
func NewDecoder(t *labels.SymbolTable) Decoder { // FIXME remove t
        var d Decoder
        d.builder = labels.NewScratchBuilder(0)
        return d
}

// Type returns the type of the record, taken from its first byte.
// It returns Unknown if rec is empty or the byte is not a known record type.
func (d *Decoder) Type(rec []byte) Type {
        if len(rec) == 0 {
                return Unknown
        }
        t := Type(rec[0])
        switch t {
        case Series, Samples, Tombstones, Exemplars, MmapMarkers, Metadata, HistogramSamples, FloatHistogramSamples:
                return t
        default:
                return Unknown
        }
}

// Series appends series in rec to the given slice.
// Each entry is read as a 64-bit reference followed by a label set. It
// returns an error if rec is not a Series record, if decoding fails, or if
// bytes are left over once decoding stops.
func (d *Decoder) Series(rec []byte, series []RefSeries) ([]RefSeries, error) {
        dec := encoding.Decbuf{B: rec}

        if recType := Type(dec.Byte()); recType != Series {
                return nil, errors.New("invalid record type")
        }
        for len(dec.B) > 0 && dec.Err() == nil {
                ref := storage.SeriesRef(dec.Be64())
                lset := d.DecodeLabels(&dec)

                entry := RefSeries{Ref: chunks.HeadSeriesRef(ref), Labels: lset}
                series = append(series, entry)
        }
        if err := dec.Err(); err != nil {
                return nil, err
        }
        if rest := len(dec.B); rest > 0 {
                return nil, fmt.Errorf("unexpected %d bytes left in entry", rest)
        }
        return series, nil
}

// Metadata appends metadata in rec to the given slice.
// Each entry is read as a series reference, a type byte, and a counted list of
// name/value string fields. It returns an error if rec is not a Metadata
// record, if decoding fails, or if bytes are left over once decoding stops.
func (d *Decoder) Metadata(rec []byte, metadata []RefMetadata) ([]RefMetadata, error) {
        dec := encoding.Decbuf{B: rec}

        if recType := Type(dec.Byte()); recType != Metadata {
                return nil, errors.New("invalid record type")
        }
        for len(dec.B) > 0 && dec.Err() == nil {
                entry := RefMetadata{}
                entry.Ref = chunks.HeadSeriesRef(dec.Uvarint64())
                entry.Type = dec.Byte()

                // We're currently aware of two more metadata fields other than TYPE; that is UNIT and HELP.
                // We can skip the rest of the fields (if we encounter any), but we must decode them anyway
                // so we can correctly align with the start of the next metadata record.
                for remaining := dec.Uvarint(); remaining > 0; remaining-- {
                        fieldName := dec.UvarintStr()
                        fieldValue := dec.UvarintStr()
                        if fieldName == unitMetaName {
                                entry.Unit = fieldValue
                        } else if fieldName == helpMetaName {
                                entry.Help = fieldValue
                        }
                }

                metadata = append(metadata, entry)
        }
        if err := dec.Err(); err != nil {
                return nil, err
        }
        if rest := len(dec.B); rest > 0 {
                return nil, fmt.Errorf("unexpected %d bytes left in entry", rest)
        }
        return metadata, nil
}

// DecodeLabels decodes one set of labels from dec: a label count followed by
// that many name/value pairs. The decoder's scratch builder is reset and
// reused on every call.
func (d *Decoder) DecodeLabels(dec *encoding.Decbuf) labels.Labels {
        d.builder.Reset()
        for remaining := dec.Uvarint(); remaining > 0; remaining-- {
                name := dec.UvarintBytes()
                value := dec.UvarintBytes()
                d.builder.UnsafeAddBytes(name, value)
        }
        return d.builder.Labels()
}

// Samples appends samples in rec to the given slice.
//
// After the type byte the record holds a base series reference and a base
// timestamp; every sample then stores its reference and timestamp as deltas
// against those bases, followed by the 64 bits of its float value. Elements
// already present in samples are preserved. It returns an error if rec is not
// a Samples record, if decoding fails, or if bytes are left over once
// decoding stops. A record consisting only of the type byte yields samples
// unchanged.
func (d *Decoder) Samples(rec []byte, samples []RefSample) ([]RefSample, error) {
        dec := encoding.Decbuf{B: rec}

        if Type(dec.Byte()) != Samples {
                return nil, errors.New("invalid record type")
        }
        if dec.Len() == 0 {
                return samples, nil
        }
        var (
                baseRef  = dec.Be64()
                baseTime = dec.Be64int64()
        )
        // Allow 1 byte for each varint and 8 for the value; the output slice
        // needs room for that many additional samples. Grow while keeping what
        // is already in the slice, so this really is an append rather than a
        // replacement of the caller's data.
        if minSize := dec.Len() / (1 + 1 + 8); cap(samples)-len(samples) < minSize {
                grown := make([]RefSample, len(samples), len(samples)+minSize)
                copy(grown, samples)
                samples = grown
        }
        for len(dec.B) > 0 && dec.Err() == nil {
                dref := dec.Varint64()
                dtime := dec.Varint64()
                val := dec.Be64()

                samples = append(samples, RefSample{
                        Ref: chunks.HeadSeriesRef(int64(baseRef) + dref),
                        T:   baseTime + dtime,
                        V:   math.Float64frombits(val),
                })
        }

        if dec.Err() != nil {
                return nil, fmt.Errorf("decode error after %d samples: %w", len(samples), dec.Err())
        }
        if len(dec.B) > 0 {
                return nil, fmt.Errorf("unexpected %d bytes left in entry", len(dec.B))
        }
        return samples, nil
}

// Tombstones appends tombstones in rec to the given slice.
// Each entry is read as a 64-bit series reference followed by the min and max
// time of a single interval. It returns an error if rec is not a Tombstones
// record, if decoding fails, or if bytes are left over once decoding stops.
func (d *Decoder) Tombstones(rec []byte, tstones []tombstones.Stone) ([]tombstones.Stone, error) {
        dec := encoding.Decbuf{B: rec}

        if recType := Type(dec.Byte()); recType != Tombstones {
                return nil, errors.New("invalid record type")
        }
        for dec.Len() > 0 && dec.Err() == nil {
                // Read in wire order: reference, then mint, then maxt.
                ref := storage.SeriesRef(dec.Be64())
                mint := dec.Varint64()
                maxt := dec.Varint64()

                stone := tombstones.Stone{
                        Ref:       ref,
                        Intervals: tombstones.Intervals{{Mint: mint, Maxt: maxt}},
                }
                tstones = append(tstones, stone)
        }
        if err := dec.Err(); err != nil {
                return nil, err
        }
        if rest := len(dec.B); rest > 0 {
                return nil, fmt.Errorf("unexpected %d bytes left in entry", rest)
        }
        return tstones, nil
}

// Exemplars appends exemplars in rec to the given slice. It checks the record
// type byte and leaves decoding of the remainder to ExemplarsFromBuffer.
func (d *Decoder) Exemplars(rec []byte, exemplars []RefExemplar) ([]RefExemplar, error) {
        dec := encoding.Decbuf{B: rec}
        if Type(dec.Byte()) != Exemplars {
                return nil, errors.New("invalid record type")
        }

        return d.ExemplarsFromBuffer(&dec, exemplars)
}

// ExemplarsFromBuffer appends the exemplars found in dec to the given slice.
// dec is expected to be positioned after the record type byte. The buffer
// holds a base series reference and base timestamp, followed by entries made
// of a reference delta, a timestamp delta, the 64 bits of the value and a
// label set. An empty buffer yields exemplars unchanged.
func (d *Decoder) ExemplarsFromBuffer(dec *encoding.Decbuf, exemplars []RefExemplar) ([]RefExemplar, error) {
        if dec.Len() == 0 {
                return exemplars, nil
        }
        baseRef := dec.Be64()
        baseTime := dec.Be64int64()

        for len(dec.B) > 0 && dec.Err() == nil {
                refDelta := dec.Varint64()
                timeDelta := dec.Varint64()
                bits := dec.Be64()
                lset := d.DecodeLabels(dec)

                var ex RefExemplar
                ex.Ref = chunks.HeadSeriesRef(baseRef + uint64(refDelta))
                ex.T = baseTime + timeDelta
                ex.V = math.Float64frombits(bits)
                ex.Labels = lset
                exemplars = append(exemplars, ex)
        }

        if err := dec.Err(); err != nil {
                return nil, fmt.Errorf("decode error after %d exemplars: %w", len(exemplars), err)
        }
        if rest := len(dec.B); rest > 0 {
                return nil, fmt.Errorf("unexpected %d bytes left in entry", rest)
        }
        return exemplars, nil
}

// MmapMarkers appends the mmap markers in rec to the given slice.
// Each entry is read as two 64-bit values: the series reference and the mmap
// reference. It returns an error if rec is not a MmapMarkers record, if
// decoding fails, or if bytes are left over once decoding stops.
func (d *Decoder) MmapMarkers(rec []byte, markers []RefMmapMarker) ([]RefMmapMarker, error) {
        dec := encoding.Decbuf{B: rec}
        if Type(dec.Byte()) != MmapMarkers {
                return nil, errors.New("invalid record type")
        }

        if dec.Len() == 0 {
                return markers, nil
        }
        for len(dec.B) > 0 && dec.Err() == nil {
                var marker RefMmapMarker
                marker.Ref = chunks.HeadSeriesRef(dec.Be64())
                marker.MmapRef = chunks.ChunkDiskMapperRef(dec.Be64())
                markers = append(markers, marker)
        }

        if err := dec.Err(); err != nil {
                return nil, fmt.Errorf("decode error after %d mmap markers: %w", len(markers), err)
        }
        if rest := len(dec.B); rest > 0 {
                return nil, fmt.Errorf("unexpected %d bytes left in entry", rest)
        }
        return markers, nil
}

// HistogramSamples appends the histogram samples in rec to the given slice.
// After the type byte the record holds a base series reference and a base
// timestamp; each entry then stores a reference delta, a timestamp delta and
// a histogram decoded by DecodeHistogram. A record consisting only of the type
// byte yields histograms unchanged.
func (d *Decoder) HistogramSamples(rec []byte, histograms []RefHistogramSample) ([]RefHistogramSample, error) {
        dec := encoding.Decbuf{B: rec}
        if Type(dec.Byte()) != HistogramSamples {
                return nil, errors.New("invalid record type")
        }
        if dec.Len() == 0 {
                return histograms, nil
        }
        baseRef := dec.Be64()
        baseTime := dec.Be64int64()

        for len(dec.B) > 0 && dec.Err() == nil {
                refDelta := dec.Varint64()
                timeDelta := dec.Varint64()

                // Every sample gets its own freshly allocated histogram.
                h := new(histogram.Histogram)
                DecodeHistogram(&dec, h)

                histograms = append(histograms, RefHistogramSample{
                        Ref: chunks.HeadSeriesRef(baseRef + uint64(refDelta)),
                        T:   baseTime + timeDelta,
                        H:   h,
                })
        }

        if err := dec.Err(); err != nil {
                return nil, fmt.Errorf("decode error after %d histograms: %w", len(histograms), err)
        }
        if rest := len(dec.B); rest > 0 {
                return nil, fmt.Errorf("unexpected %d bytes left in entry", rest)
        }
        return histograms, nil
}

// DecodeHistogram decodes a Histogram from buf into h.
// Fields are read in this order: counter reset hint, schema, zero threshold,
// zero count, count, sum, positive spans, negative spans, positive buckets,
// negative buckets. Each span/bucket list is prefixed with its length; a new
// slice is only allocated when that length is greater than zero, otherwise
// whatever slice h already holds is kept and filled in place.
func DecodeHistogram(buf *encoding.Decbuf, h *histogram.Histogram) {
        h.CounterResetHint = histogram.CounterResetHint(buf.Byte())

        h.Schema = int32(buf.Varint64())
        h.ZeroThreshold = math.Float64frombits(buf.Be64())

        h.ZeroCount = buf.Uvarint64()
        h.Count = buf.Uvarint64()
        h.Sum = math.Float64frombits(buf.Be64())

        // readSpans reads a length-prefixed span list, reusing the given slice
        // when the encoded length is not positive.
        readSpans := func(spans []histogram.Span) []histogram.Span {
                if n := buf.Uvarint(); n > 0 {
                        spans = make([]histogram.Span, n)
                }
                for i := range spans {
                        spans[i].Offset = int32(buf.Varint64())
                        spans[i].Length = buf.Uvarint32()
                }
                return spans
        }
        // readBuckets reads a length-prefixed bucket list, reusing the given
        // slice when the encoded length is not positive.
        readBuckets := func(buckets []int64) []int64 {
                if n := buf.Uvarint(); n > 0 {
                        buckets = make([]int64, n)
                }
                for i := range buckets {
                        buckets[i] = buf.Varint64()
                }
                return buckets
        }

        h.PositiveSpans = readSpans(h.PositiveSpans)
        h.NegativeSpans = readSpans(h.NegativeSpans)
        h.PositiveBuckets = readBuckets(h.PositiveBuckets)
        h.NegativeBuckets = readBuckets(h.NegativeBuckets)
}

// FloatHistogramSamples appends the float histogram samples found in rec to
// histograms and returns the extended slice.
//
// The record must start with the FloatHistogramSamples type byte. A record
// holding only that byte yields histograms unchanged. Otherwise a base
// reference and base timestamp follow, and every sample stores its reference
// and timestamp as deltas to those. An error is returned for a wrong record
// type, for a decoding failure, or when undecoded bytes remain.
func (d *Decoder) FloatHistogramSamples(rec []byte, histograms []RefFloatHistogramSample) ([]RefFloatHistogramSample, error) {
        dec := encoding.Decbuf{B: rec}
        if Type(dec.Byte()) != FloatHistogramSamples {
                return nil, errors.New("invalid record type")
        }
        if dec.Len() == 0 {
                return histograms, nil
        }

        baseRef := dec.Be64()
        baseTime := dec.Be64int64()

        for len(dec.B) > 0 && dec.Err() == nil {
                refDelta := dec.Varint64()
                timeDelta := dec.Varint64()

                fh := &histogram.FloatHistogram{}
                DecodeFloatHistogram(&dec, fh)

                histograms = append(histograms, RefFloatHistogramSample{
                        Ref: chunks.HeadSeriesRef(baseRef + uint64(refDelta)),
                        T:   baseTime + timeDelta,
                        FH:  fh,
                })
        }

        if err := dec.Err(); err != nil {
                return nil, fmt.Errorf("decode error after %d histograms: %w", len(histograms), err)
        }
        if rest := len(dec.B); rest > 0 {
                return nil, fmt.Errorf("unexpected %d bytes left in entry", rest)
        }
        return histograms, nil
}

// DecodeFloatHistogram reads one float histogram from buf and stores it in fh.
//
// Fields are consumed in this order: counter reset hint (one byte), schema,
// zero threshold, zero count, count, sum, then positive spans, negative spans,
// positive buckets and negative buckets, each preceded by its element count.
// A slice on fh is only allocated when its encoded count is non-zero;
// otherwise whatever fh already holds is left in place. Nothing is returned:
// decoding errors have to be checked on buf by the caller.
func DecodeFloatHistogram(buf *encoding.Decbuf, fh *histogram.FloatHistogram) {
        fh.CounterResetHint = histogram.CounterResetHint(buf.Byte())

        fh.Schema = int32(buf.Varint64())
        fh.ZeroThreshold = buf.Be64Float64()

        fh.ZeroCount = buf.Be64Float64()
        fh.Count = buf.Be64Float64()
        fh.Sum = buf.Be64Float64()

        // Positive spans.
        if n := buf.Uvarint(); n > 0 {
                fh.PositiveSpans = make([]histogram.Span, n)
        }
        for i := 0; i < len(fh.PositiveSpans); i++ {
                span := &fh.PositiveSpans[i]
                span.Offset = int32(buf.Varint64())
                span.Length = buf.Uvarint32()
        }

        // Negative spans.
        if n := buf.Uvarint(); n > 0 {
                fh.NegativeSpans = make([]histogram.Span, n)
        }
        for i := 0; i < len(fh.NegativeSpans); i++ {
                span := &fh.NegativeSpans[i]
                span.Offset = int32(buf.Varint64())
                span.Length = buf.Uvarint32()
        }

        // Positive buckets.
        if n := buf.Uvarint(); n > 0 {
                fh.PositiveBuckets = make([]float64, n)
        }
        for i := 0; i < len(fh.PositiveBuckets); i++ {
                fh.PositiveBuckets[i] = buf.Be64Float64()
        }

        // Negative buckets.
        if n := buf.Uvarint(); n > 0 {
                fh.NegativeBuckets = make([]float64, n)
        }
        for i := 0; i < len(fh.NegativeBuckets); i++ {
                fh.NegativeBuckets[i] = buf.Be64Float64()
        }
}

// Encoder encodes series, metadata, sample, tombstone, exemplar, mmap marker
// and (float) histogram sample records.
// It carries no state; the zero value is ready to use.
type Encoder struct{}

// Series writes a Series record to b and returns the resulting slice.
// After the record type byte, each series is stored as its reference followed
// by its labels.
func (e *Encoder) Series(series []RefSeries, b []byte) []byte {
        enc := encoding.Encbuf{B: b}
        enc.PutByte(byte(Series))

        for i := range series {
                enc.PutBE64(uint64(series[i].Ref))
                EncodeLabels(&enc, series[i].Labels)
        }
        return enc.Get()
}

// Metadata writes a Metadata record to b and returns the resulting slice.
// After the record type byte, each entry is stored as its reference, its type
// byte, the number of key/value fields that follow, and then those fields.
func (e *Encoder) Metadata(metadata []RefMetadata, b []byte) []byte {
        enc := encoding.Encbuf{B: b}
        enc.PutByte(byte(Metadata))

        for i := range metadata {
                entry := metadata[i]

                enc.PutUvarint64(uint64(entry.Ref))
                enc.PutByte(entry.Type)

                // num_fields: We currently have two more metadata fields, UNIT and HELP.
                enc.PutUvarint(2)
                enc.PutUvarintStr(unitMetaName)
                enc.PutUvarintStr(entry.Unit)
                enc.PutUvarintStr(helpMetaName)
                enc.PutUvarintStr(entry.Help)
        }

        return enc.Get()
}

// EncodeLabels writes lbls into buf: first the number of labels, then the name
// and value of every label in the order lbls.Range yields them.
func EncodeLabels(buf *encoding.Encbuf, lbls labels.Labels) {
        // TODO: reconsider if this function could be pushed down into labels.Labels to be more efficient.
        buf.PutUvarint(lbls.Len())

        writePair := func(lbl labels.Label) {
                buf.PutUvarintStr(lbl.Name)
                buf.PutUvarintStr(lbl.Value)
        }
        lbls.Range(writePair)
}

// Samples writes a Samples record to b and returns the resulting slice.
// An empty input produces a record holding only the type byte.
func (e *Encoder) Samples(samples []RefSample, b []byte) []byte {
        enc := encoding.Encbuf{B: b}
        enc.PutByte(byte(Samples))

        if len(samples) == 0 {
                return enc.Get()
        }

        // The first sample supplies the base reference and base timestamp;
        // every sample (the first included) is then written as a delta to them.
        baseRef, baseT := samples[0].Ref, samples[0].T

        enc.PutBE64(uint64(baseRef))
        enc.PutBE64int64(baseT)

        for i := range samples {
                cur := samples[i]
                enc.PutVarint64(int64(cur.Ref) - int64(baseRef))
                enc.PutVarint64(cur.T - baseT)
                enc.PutBE64(math.Float64bits(cur.V))
        }
        return enc.Get()
}

// Tombstones writes a Tombstones record to b and returns the resulting slice.
// Every interval of every stone becomes one entry made of the series
// reference, the interval's min time and its max time.
func (e *Encoder) Tombstones(tstones []tombstones.Stone, b []byte) []byte {
        enc := encoding.Encbuf{B: b}
        enc.PutByte(byte(Tombstones))

        for i := range tstones {
                ref := uint64(tstones[i].Ref)
                for _, interval := range tstones[i].Intervals {
                        enc.PutBE64(ref)
                        enc.PutVarint64(interval.Mint)
                        enc.PutVarint64(interval.Maxt)
                }
        }
        return enc.Get()
}

// Exemplars writes an Exemplars record to b and returns the resulting slice.
// An empty input produces a record holding only the type byte; otherwise the
// payload is produced by EncodeExemplarsIntoBuffer.
func (e *Encoder) Exemplars(exemplars []RefExemplar, b []byte) []byte {
        enc := encoding.Encbuf{B: b}
        enc.PutByte(byte(Exemplars))

        if len(exemplars) > 0 {
                e.EncodeExemplarsIntoBuffer(exemplars, &enc)
        }

        return enc.Get()
}

// EncodeExemplarsIntoBuffer appends the payload of an exemplars record to buf.
// The record type byte is not written here.
//
// The first exemplar supplies the base reference and base timestamp; every
// exemplar (the first included) is then written as reference delta, timestamp
// delta, value and labels. Nothing is written for an empty slice.
func (e *Encoder) EncodeExemplarsIntoBuffer(exemplars []RefExemplar, buf *encoding.Encbuf) {
        // Nothing to encode; this also keeps exemplars[0] below from panicking
        // when the method is called directly with an empty slice.
        if len(exemplars) == 0 {
                return
        }

        // Store base timestamp and base reference number of first sample.
        // All samples encode their timestamp and ref as delta to those.
        first := exemplars[0]

        buf.PutBE64(uint64(first.Ref))
        buf.PutBE64int64(first.T)

        for _, ex := range exemplars {
                buf.PutVarint64(int64(ex.Ref) - int64(first.Ref))
                buf.PutVarint64(ex.T - first.T)
                buf.PutBE64(math.Float64bits(ex.V))
                EncodeLabels(buf, ex.Labels)
        }
}

// MmapMarkers writes a MmapMarkers record to b and returns the resulting
// slice. After the record type byte, each marker is stored as its series
// reference followed by its mmap reference.
func (e *Encoder) MmapMarkers(markers []RefMmapMarker, b []byte) []byte {
        enc := encoding.Encbuf{B: b}
        enc.PutByte(byte(MmapMarkers))

        for i := range markers {
                enc.PutBE64(uint64(markers[i].Ref))
                enc.PutBE64(uint64(markers[i].MmapRef))
        }

        return enc.Get()
}

// HistogramSamples writes a HistogramSamples record to b and returns the
// resulting slice. An empty input produces a record holding only the type byte.
func (e *Encoder) HistogramSamples(histograms []RefHistogramSample, b []byte) []byte {
        enc := encoding.Encbuf{B: b}
        enc.PutByte(byte(HistogramSamples))

        if len(histograms) == 0 {
                return enc.Get()
        }

        // The first histogram supplies the base reference and base timestamp;
        // every histogram (the first included) is written as a delta to them.
        baseRef, baseT := histograms[0].Ref, histograms[0].T
        enc.PutBE64(uint64(baseRef))
        enc.PutBE64int64(baseT)

        for i := range histograms {
                sample := histograms[i]
                enc.PutVarint64(int64(sample.Ref) - int64(baseRef))
                enc.PutVarint64(sample.T - baseT)

                EncodeHistogram(&enc, sample.H)
        }

        return enc.Get()
}

// EncodeHistogram appends the encoding of h to buf.
//
// Fields are written in this order: counter reset hint (one byte), schema,
// zero threshold, zero count, count, sum, then positive spans, negative spans,
// positive buckets and negative buckets, each preceded by its element count.
func EncodeHistogram(buf *encoding.Encbuf, h *histogram.Histogram) {
        buf.PutByte(byte(h.CounterResetHint))

        buf.PutVarint64(int64(h.Schema))
        buf.PutBE64(math.Float64bits(h.ZeroThreshold))

        buf.PutUvarint64(h.ZeroCount)
        buf.PutUvarint64(h.Count)
        buf.PutBE64(math.Float64bits(h.Sum))

        buf.PutUvarint(len(h.PositiveSpans))
        for i := range h.PositiveSpans {
                buf.PutVarint64(int64(h.PositiveSpans[i].Offset))
                buf.PutUvarint32(h.PositiveSpans[i].Length)
        }

        buf.PutUvarint(len(h.NegativeSpans))
        for i := range h.NegativeSpans {
                buf.PutVarint64(int64(h.NegativeSpans[i].Offset))
                buf.PutUvarint32(h.NegativeSpans[i].Length)
        }

        buf.PutUvarint(len(h.PositiveBuckets))
        for i := range h.PositiveBuckets {
                buf.PutVarint64(h.PositiveBuckets[i])
        }

        buf.PutUvarint(len(h.NegativeBuckets))
        for i := range h.NegativeBuckets {
                buf.PutVarint64(h.NegativeBuckets[i])
        }
}

// FloatHistogramSamples writes a FloatHistogramSamples record to b and returns
// the resulting slice. An empty input produces a record holding only the type
// byte.
func (e *Encoder) FloatHistogramSamples(histograms []RefFloatHistogramSample, b []byte) []byte {
        enc := encoding.Encbuf{B: b}
        enc.PutByte(byte(FloatHistogramSamples))

        if len(histograms) == 0 {
                return enc.Get()
        }

        // The first histogram supplies the base reference and base timestamp;
        // every histogram (the first included) is written as a delta to them.
        baseRef, baseT := histograms[0].Ref, histograms[0].T
        enc.PutBE64(uint64(baseRef))
        enc.PutBE64int64(baseT)

        for i := range histograms {
                sample := histograms[i]
                enc.PutVarint64(int64(sample.Ref) - int64(baseRef))
                enc.PutVarint64(sample.T - baseT)

                EncodeFloatHistogram(&enc, sample.FH)
        }

        return enc.Get()
}

// EncodeFloatHistogram appends the encoding of the float histogram h to buf.
//
// Fields are written in this order: counter reset hint (one byte), schema,
// zero threshold, zero count, count, sum, then positive spans, negative spans,
// positive buckets and negative buckets, each preceded by its element count.
func EncodeFloatHistogram(buf *encoding.Encbuf, h *histogram.FloatHistogram) {
        buf.PutByte(byte(h.CounterResetHint))

        buf.PutVarint64(int64(h.Schema))
        buf.PutBEFloat64(h.ZeroThreshold)

        buf.PutBEFloat64(h.ZeroCount)
        buf.PutBEFloat64(h.Count)
        buf.PutBEFloat64(h.Sum)

        buf.PutUvarint(len(h.PositiveSpans))
        for i := range h.PositiveSpans {
                buf.PutVarint64(int64(h.PositiveSpans[i].Offset))
                buf.PutUvarint32(h.PositiveSpans[i].Length)
        }

        buf.PutUvarint(len(h.NegativeSpans))
        for i := range h.NegativeSpans {
                buf.PutVarint64(int64(h.NegativeSpans[i].Offset))
                buf.PutUvarint32(h.NegativeSpans[i].Length)
        }

        buf.PutUvarint(len(h.PositiveBuckets))
        for i := range h.PositiveBuckets {
                buf.PutBEFloat64(h.PositiveBuckets[i])
        }

        buf.PutUvarint(len(h.NegativeBuckets))
        for i := range h.NegativeBuckets {
                buf.PutBEFloat64(h.NegativeBuckets[i])
        }
}

// Copyright 2018 The Prometheus Authors
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package tsdb

import (
        "encoding/json"
        "fmt"
        "io"
        "os"
        "path/filepath"

        "github.com/go-kit/log"
        "github.com/go-kit/log/level"

        tsdb_errors "github.com/prometheus/prometheus/tsdb/errors"
        "github.com/prometheus/prometheus/tsdb/fileutil"
)

// repairBadIndexVersion repairs an issue in index and meta.json persistence introduced in
// commit 129773b41a565fde5156301e37f9a87158030443.
//
// For every block directory under dir whose meta.json does not report
// metaVersion1, the index file is copied to "index.repaired", the 5th byte of
// the copy is set to 2, the copy replaces the original index, and meta.json is
// rewritten with metaVersion1. Blocks whose meta.json cannot be read are
// logged and skipped. The first failure while repairing a block aborts the run
// and is returned; any "index.repaired" file created so far is removed before
// returning.
func repairBadIndexVersion(logger log.Logger, dir string) error {
        // All blocks written by Prometheus 2.1 with a meta.json version of 2 are affected.
        // We must actually set the index file version to 2 and revert the meta.json version back to 1.
        dirs, err := blockDirs(dir)
        if err != nil {
                return fmt.Errorf("list block dirs in %q: %w", dir, err)
        }

        tmpFiles := make([]string, 0, len(dirs))
        defer func() {
                for _, tmp := range tmpFiles {
                        if err := os.RemoveAll(tmp); err != nil {
                                level.Error(logger).Log("msg", "remove tmp file", "err", err.Error())
                        }
                }
        }()

        // failWith returns cause combined with any error from closing the
        // still-open files, so no handle is leaked on an error path.
        failWith := func(cause error, files ...*os.File) error {
                errs := tsdb_errors.NewMulti(cause)
                for _, f := range files {
                        if err := f.Close(); err != nil {
                                errs.Add(fmt.Errorf("close: %w", err))
                        }
                }
                return errs.Err()
        }

        for _, d := range dirs {
                meta, err := readBogusMetaFile(d)
                if err != nil {
                        level.Error(logger).Log("msg", "failed to read meta.json for a block during repair process; skipping", "dir", d, "err", err)
                        continue
                }
                if meta.Version == metaVersion1 {
                        level.Info(logger).Log(
                                "msg", "Found healthy block",
                                "mint", meta.MinTime,
                                "maxt", meta.MaxTime,
                                "ulid", meta.ULID,
                        )
                        continue
                }
                level.Info(logger).Log(
                        "msg", "Fixing broken block",
                        "mint", meta.MinTime,
                        "maxt", meta.MaxTime,
                        "ulid", meta.ULID,
                )

                repl, err := os.Create(filepath.Join(d, "index.repaired"))
                if err != nil {
                        return fmt.Errorf("create index.repaired for block dir: %v: %w", d, err)
                }
                tmpFiles = append(tmpFiles, repl.Name())

                broken, err := os.Open(filepath.Join(d, indexFilename))
                if err != nil {
                        return failWith(fmt.Errorf("open broken index for block dir: %v: %w", d, err), repl)
                }
                if _, err := io.Copy(repl, broken); err != nil {
                        return failWith(fmt.Errorf("copy content of index to index.repaired for block dir: %v: %w", d, err), repl, broken)
                }

                // Set the 5th byte to 2 to indicate the correct file format version.
                if _, err := repl.WriteAt([]byte{2}, 4); err != nil {
                        return failWith(fmt.Errorf("rewrite of index.repaired for block dir: %v: %w", d, err), repl, broken)
                }
                if err := repl.Sync(); err != nil {
                        return failWith(fmt.Errorf("sync of index.repaired for block dir: %v: %w", d, err), repl, broken)
                }
                if err := repl.Close(); err != nil {
                        return failWith(fmt.Errorf("close repaired index for block dir: %v: %w", d, err), broken)
                }
                // Both files are closed from here on, so each failure is reported
                // directly instead of being filtered through another Close call.
                if err := broken.Close(); err != nil {
                        return fmt.Errorf("close broken index for block dir: %v: %w", d, err)
                }
                if err := fileutil.Replace(repl.Name(), broken.Name()); err != nil {
                        return fmt.Errorf("replaced broken index with index.repaired for block dir: %v: %w", d, err)
                }
                // Reset version of meta.json to 1.
                meta.Version = metaVersion1
                if _, err := writeMetaFile(logger, d, meta); err != nil {
                        return fmt.Errorf("write meta for block dir: %v: %w", d, err)
                }
        }
        return nil
}

// readBogusMetaFile loads and JSON-decodes the meta file found in dir.
// Only metaVersion1 and version 2 are accepted; any other version is reported
// as an error.
func readBogusMetaFile(dir string) (*BlockMeta, error) {
        raw, err := os.ReadFile(filepath.Join(dir, metaFilename))
        if err != nil {
                return nil, err
        }

        meta := &BlockMeta{}
        if err := json.Unmarshal(raw, meta); err != nil {
                return nil, err
        }

        if v := meta.Version; v == metaVersion1 || v == 2 {
                return meta, nil
        }
        return nil, fmt.Errorf("unexpected meta file version %d", meta.Version)
}

// Copyright 2017 The Prometheus Authors
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package tombstones

import (
        "encoding/binary"
        "errors"
        "fmt"
        "hash"
        "hash/crc32"
        "math"
        "os"
        "path/filepath"
        "sort"
        "sync"

        "github.com/go-kit/log"
        "github.com/go-kit/log/level"

        "github.com/prometheus/prometheus/storage"
        "github.com/prometheus/prometheus/tsdb/encoding"
        tsdb_errors "github.com/prometheus/prometheus/tsdb/errors"
        "github.com/prometheus/prometheus/tsdb/fileutil"
)

// TombstonesFilename is the name of the tombstones file inside a directory.
const TombstonesFilename = "tombstones"

const (
        // MagicTombstone is 4 bytes at the head of a tombstone file.
        MagicTombstone = 0x0130BA30

        // tombstoneFormatV1 is the format flag written as the first byte by
        // Encode and required by Decode.
        tombstoneFormatV1          = 1
        // tombstoneFormatVersionSize is the number of leading format bytes that
        // are excluded from the checksum.
        tombstoneFormatVersionSize = 1
        // tombstonesHeaderSize is the size in bytes of the file header
        // (presumably the 4 magic bytes plus the format byte).
        tombstonesHeaderSize       = 5
        // tombstonesCRCSize is the number of trailing bytes holding the checksum.
        tombstonesCRCSize          = 4
)

// castagnoliTable is the CRC32 table for the Castagnoli polynomial used by
// newCRC32.
//
// The table gets initialized with sync.Once but may still cause a race
// with any other use of the crc32 package anywhere. Thus we initialize it
// before.
var castagnoliTable *crc32.Table

// init builds castagnoliTable during package initialization.
func init() {
        castagnoliTable = crc32.MakeTable(crc32.Castagnoli)
}

// newCRC32 returns a fresh CRC32 hash built on the package's preconfigured
// table. Keeping the construction in one place means the polynomial can be
// changed here alone, should that ever be necessary.
func newCRC32() hash.Hash32 {
        h := crc32.New(castagnoliTable)
        return h
}

// Reader gives access to tombstone intervals by series reference.
type Reader interface {
        // Get returns deletion intervals for the series with the given reference.
        Get(ref storage.SeriesRef) (Intervals, error)

        // Iter calls the given function for each encountered interval.
        // It returns the error the function or the iteration produced, if any.
        Iter(func(storage.SeriesRef, Intervals) error) error

        // Total returns the total count of tombstones.
        Total() uint64

        // Close releases any underlying resources.
        Close() error
}

// WriteFile encodes the tombstones of tr and stores them in the tombstones
// file inside dir, returning the number of bytes written.
//
// The content is first written to a ".tmp" sibling, synced and closed, and
// only then moved over the final path. The file consists of the magic number,
// the output of Encode, and a CRC32 of that output with its leading format
// byte left out. The temporary file is removed on every return path.
func WriteFile(logger log.Logger, dir string, tr Reader) (int64, error) {
        path := filepath.Join(dir, TombstonesFilename)
        tmp := path + ".tmp"
        crc := newCRC32()
        var size int

        f, err := os.Create(tmp)
        if err != nil {
                return 0, err
        }
        defer func() {
                // f is set to nil once it has been closed explicitly, so the file
                // is never closed twice.
                if f != nil {
                        if err := f.Close(); err != nil {
                                level.Error(logger).Log("msg", "close tmp file", "err", err.Error())
                        }
                }
                if err := os.RemoveAll(tmp); err != nil {
                        level.Error(logger).Log("msg", "remove tmp file", "err", err.Error())
                }
        }()

        buf := encoding.Encbuf{B: make([]byte, 3*binary.MaxVarintLen64)}
        buf.Reset()
        // Write the meta.
        buf.PutBE32(MagicTombstone)
        n, err := f.Write(buf.Get())
        if err != nil {
                return 0, err
        }
        size += n

        encoded, err := Encode(tr)
        if err != nil {
                return 0, fmt.Errorf("encoding tombstones: %w", err)
        }

        // Ignore first byte which is the format type. We do this for compatibility.
        if _, err := crc.Write(encoded[tombstoneFormatVersionSize:]); err != nil {
                return 0, fmt.Errorf("calculating hash for tombstones: %w", err)
        }

        n, err = f.Write(encoded)
        if err != nil {
                return 0, fmt.Errorf("writing tombstones: %w", err)
        }
        size += n

        n, err = f.Write(crc.Sum(nil))
        if err != nil {
                return 0, err
        }
        size += n

        if err := f.Sync(); err != nil {
                closeErr := f.Close()
                f = nil // Closed above; keep the deferred cleanup from closing it again.
                return 0, tsdb_errors.NewMulti(err, closeErr).Err()
        }

        closeErr := f.Close()
        f = nil // Closed above, whether or not Close reported an error.
        if closeErr != nil {
                return 0, closeErr
        }
        return int64(size), fileutil.Replace(tmp, path)
}

// Encode serializes every tombstone interval yielded by tr.Iter.
// The output starts with the format byte; each interval follows as series
// reference, min time and max time. No magic number or checksum is attached.
// The encoded bytes are returned together with any error from tr.Iter.
func Encode(tr Reader) ([]byte, error) {
        var enc encoding.Encbuf
        enc.PutByte(tombstoneFormatV1)

        writeIntervals := func(ref storage.SeriesRef, ivs Intervals) error {
                for i := range ivs {
                        enc.PutUvarint64(uint64(ref))
                        enc.PutVarint64(ivs[i].Mint)
                        enc.PutVarint64(ivs[i].Maxt)
                }
                return nil
        }

        err := tr.Iter(writeIntervals)
        return enc.Get(), err
}

// Decode parses bytes produced by Encode into an in-memory Reader.
// It fails when the leading format byte is not tombstoneFormatV1 or when an
// entry cannot be decoded.
func Decode(b []byte) (Reader, error) {
        dec := &encoding.Decbuf{B: b}

        flag := dec.Byte()
        if flag != tombstoneFormatV1 {
                return nil, fmt.Errorf("invalid tombstone format %x", flag)
        }
        if err := dec.Err(); err != nil {
                return nil, err
        }

        stones := NewMemTombstones()
        for dec.Len() > 0 {
                ref := storage.SeriesRef(dec.Uvarint64())
                lo := dec.Varint64()
                hi := dec.Varint64()
                if err := dec.Err(); err != nil {
                        return nil, err
                }

                stones.AddInterval(ref, Interval{Mint: lo, Maxt: hi})
        }
        return stones, nil
}

// Stone holds the information on the posting and time-range
// that is deleted.
type Stone struct {
        // Ref is the reference of the series the deletions apply to.
        Ref       storage.SeriesRef
        // Intervals are the deleted time ranges of that series.
        Intervals Intervals
}

// ReadTombstones loads the tombstones file from dir.
//
// It returns the decoded tombstones and the size of the file in bytes. A
// missing file is not an error: an empty in-memory Reader and size 0 are
// returned. The magic number and the trailing CRC32 (computed over the payload
// without its format byte) are verified before decoding.
func ReadTombstones(dir string) (Reader, int64, error) {
        b, err := os.ReadFile(filepath.Join(dir, TombstonesFilename))
        switch {
        case os.IsNotExist(err):
                return NewMemTombstones(), 0, nil
        case err != nil:
                return nil, 0, err
        }

        // The file must hold at least the header and the checksum. Checking the
        // header size alone would let a file without a format byte through, and
        // slicing that byte off below would then panic.
        if len(b) < tombstonesHeaderSize+tombstonesCRCSize {
                return nil, 0, fmt.Errorf("tombstones header: %w", encoding.ErrInvalidSize)
        }

        d := &encoding.Decbuf{B: b[:len(b)-tombstonesCRCSize]}
        if mg := d.Be32(); mg != MagicTombstone {
                return nil, 0, fmt.Errorf("invalid magic number %x", mg)
        }

        // Verify checksum.
        hash := newCRC32()
        // Ignore first byte which is the format type.
        if _, err := hash.Write(d.Get()[tombstoneFormatVersionSize:]); err != nil {
                return nil, 0, fmt.Errorf("write to hash: %w", err)
        }
        if binary.BigEndian.Uint32(b[len(b)-tombstonesCRCSize:]) != hash.Sum32() {
                return nil, 0, errors.New("checksum did not match")
        }

        if d.Err() != nil {
                return nil, 0, d.Err()
        }

        stonesMap, err := Decode(d.Get())
        if err != nil {
                return nil, 0, err
        }

        return stonesMap, int64(len(b)), nil
}

// MemTombstones keeps tombstone intervals in memory, grouped by series
// reference.
type MemTombstones struct {
        // intvlGroups maps a series reference to its deletion intervals.
        intvlGroups map[storage.SeriesRef]Intervals
        // mtx guards intvlGroups.
        mtx         sync.RWMutex
}

// NewMemTombstones returns an empty in-memory tombstone Reader to which
// intervals can be added.
func NewMemTombstones() *MemTombstones {
        groups := make(map[storage.SeriesRef]Intervals)
        return &MemTombstones{intvlGroups: groups}
}

// NewTestMemTombstones builds a MemTombstones in which the intervals at index
// i of the input are registered under series reference i+1.
func NewTestMemTombstones(intervals []Intervals) *MemTombstones {
        stones := NewMemTombstones()
        for idx := range intervals {
                ref := storage.SeriesRef(idx + 1)
                for _, iv := range intervals[idx] {
                        stones.AddInterval(ref, iv)
                }
        }
        return stones
}

// Get returns a copy of the intervals stored for ref, or nil when there are
// none. The returned error is always nil.
func (t *MemTombstones) Get(ref storage.SeriesRef) (Intervals, error) {
        t.mtx.RLock()
        defer t.mtx.RUnlock()

        if stored, found := t.intvlGroups[ref]; found {
                // Hand out a copy so the caller never shares the stored slice.
                out := make(Intervals, len(stored))
                copy(out, stored)
                return out, nil
        }
        return nil, nil
}

// DeleteTombstones removes all intervals stored for each of the given series
// references.
func (t *MemTombstones) DeleteTombstones(refs map[storage.SeriesRef]struct{}) {
        t.mtx.Lock()
        defer t.mtx.Unlock()

        for seriesRef := range refs {
                delete(t.intvlGroups, seriesRef)
        }
}

// TruncateBefore trims, for every series, the leading intervals up to and
// including the last one whose Maxt lies before beforeT. Series left without
// intervals are removed; the remaining intervals are stored in a fresh slice.
func (t *MemTombstones) TruncateBefore(beforeT int64) {
        t.mtx.Lock()
        defer t.mtx.Unlock()

        for ref, ivs := range t.intvlGroups {
                // Walk back from the end until an interval ends before beforeT;
                // keepFrom is then the index of the first interval to keep.
                keepFrom := len(ivs)
                for keepFrom > 0 && beforeT <= ivs[keepFrom-1].Maxt {
                        keepFrom--
                }

                kept := ivs[keepFrom:]
                if len(kept) == 0 {
                        delete(t.intvlGroups, ref)
                        continue
                }

                fresh := make(Intervals, len(kept))
                copy(fresh, kept)
                t.intvlGroups[ref] = fresh
        }
}

// Iter calls f once per series with that series' intervals while holding the
// read lock. It stops at, and returns, the first error reported by f.
func (t *MemTombstones) Iter(f func(storage.SeriesRef, Intervals) error) error {
        t.mtx.RLock()
        defer t.mtx.RUnlock()

        for seriesRef, group := range t.intvlGroups {
                err := f(seriesRef, group)
                if err != nil {
                        return err
                }
        }
        return nil
}

// Total returns the number of intervals stored across all series.
func (t *MemTombstones) Total() uint64 {
        t.mtx.RLock()
        defer t.mtx.RUnlock()

        var count uint64
        for _, group := range t.intvlGroups {
                count += uint64(len(group))
        }
        return count
}

// AddInterval merges the given intervals, one after another, into the
// intervals already stored for ref.
func (t *MemTombstones) AddInterval(ref storage.SeriesRef, itvs ...Interval) {
        t.mtx.Lock()
        defer t.mtx.Unlock()

        for i := range itvs {
                current := t.intvlGroups[ref]
                t.intvlGroups[ref] = current.Add(itvs[i])
        }
}

// Close does nothing and always returns nil.
func (*MemTombstones) Close() error {
        return nil
}

// Interval represents a single time-interval.
// Both bounds are inclusive.
type Interval struct {
        Mint, Maxt int64
}

// InBounds reports whether t lies within the closed interval [Mint, Maxt].
func (tr Interval) InBounds(t int64) bool {
        return t >= tr.Mint && t <= tr.Maxt
}

// IsSubrange reports whether tr is fully contained in a single interval of
// dranges.
func (tr Interval) IsSubrange(dranges Intervals) bool {
        for _, r := range dranges {
                if r.InBounds(tr.Mint) && r.InBounds(tr.Maxt) {
                        return true
                }
        }

        return false
}

// Intervals represents a set of increasing and non-overlapping time-intervals.
type Intervals []Interval

// Add merges the new time-range into the existing ones and returns the result.
// The existing ones must be sorted. Overlapping and directly adjacent
// intervals are fused into one. The receiver's backing array may be modified
// and reused, so callers should continue with the returned value only.
func (in Intervals) Add(n Interval) Intervals {
        if len(in) == 0 {
                return append(in, n)
        }
        // Find min and max indexes of intervals that overlap with the new interval.
        // Intervals are closed [t1, t2] and t is discrete, so if neighbour intervals are 1 step difference
        // to the new one, we can merge those together.
        mini := 0
        if n.Mint != math.MinInt64 { // Avoid overflow.
                mini = sort.Search(len(in), func(i int) bool { return in[i].Maxt >= n.Mint-1 })
                if mini == len(in) {
                        return append(in, n)
                }
        }

        // maxi counts, starting at mini, the intervals that get merged with n.
        // It is relative to mini, so when n extends to MaxInt64 it covers
        // everything from mini to the end, which is len(in)-mini and not len(in).
        maxi := len(in) - mini
        if n.Maxt != math.MaxInt64 { // Avoid overflow.
                maxi = sort.Search(len(in)-mini, func(i int) bool { return in[mini+i].Mint > n.Maxt+1 })
                if maxi == 0 {
                        // Nothing to merge with: n goes right in front of in[mini].
                        if mini == 0 {
                                return append(Intervals{n}, in...)
                        }
                        return append(in[:mini], append(Intervals{n}, in[mini:]...)...)
                }
        }

        // Fold in[mini : mini+maxi] and n into in[mini].
        if n.Mint < in[mini].Mint {
                in[mini].Mint = n.Mint
        }
        in[mini].Maxt = in[maxi+mini-1].Maxt
        if n.Maxt > in[mini].Maxt {
                in[mini].Maxt = n.Maxt
        }
        return append(in[:mini+1], in[maxi+mini:]...)
}

// Copyright 2019 The Prometheus Authors
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package tsdb

import (
        "context"
        "fmt"
        "path/filepath"

        "github.com/go-kit/log"

        "github.com/prometheus/prometheus/storage"
        "github.com/prometheus/prometheus/tsdb/chunkenc"
)

// ErrInvalidTimes is returned by CreateBlock when the chunk range is negative.
var ErrInvalidTimes = fmt.Errorf("max time is lesser than min time")

// CreateBlock creates a chunkrange block from the samples passed to it, and writes it to disk.
//
// A chunkRange of 0 selects DefaultBlockDuration; a negative one yields
// ErrInvalidTimes. All samples of every series are appended through a block
// writer rooted at dir, committing whenever the sample type changes within a
// series and, after a series, whenever more than commitAfter samples have
// accumulated. On success the path of the flushed block is returned.
func CreateBlock(series []storage.Series, dir string, chunkRange int64, logger log.Logger) (string, error) {
        if chunkRange == 0 {
                chunkRange = DefaultBlockDuration
        }
        if chunkRange < 0 {
                return "", ErrInvalidTimes
        }

        w, err := NewBlockWriter(logger, dir, chunkRange)
        if err != nil {
                return "", err
        }
        defer func() {
                if closeErr := w.Close(); closeErr != nil {
                        logger.Log("err closing blockwriter", closeErr.Error())
                }
        }()

        const commitAfter = 10000
        var (
                ctx         = context.Background()
                app         = w.Appender(ctx)
                it          chunkenc.Iterator
                sampleCount int
        )

        // rotate commits what has been appended so far and continues with a
        // fresh appender and a reset sample counter.
        rotate := func() error {
                if commitErr := app.Commit(); commitErr != nil {
                        return commitErr
                }
                app = w.Appender(ctx)
                sampleCount = 0
                return nil
        }

        for _, s := range series {
                var ref storage.SeriesRef
                it = s.Iterator(it)
                lset := s.Labels()

                prevTyp := it.Next()
                for typ := prevTyp; typ != chunkenc.ValNone; typ = it.Next() {
                        if typ != prevTyp {
                                // The behaviour of appender is undefined if samples of different types
                                // are appended to the same series in a single Commit().
                                if err = rotate(); err != nil {
                                        return "", err
                                }
                        }

                        switch typ {
                        case chunkenc.ValFloat:
                                t, v := it.At()
                                ref, err = app.Append(ref, lset, t, v)
                        case chunkenc.ValHistogram:
                                t, h := it.AtHistogram(nil)
                                ref, err = app.AppendHistogram(ref, lset, t, h, nil)
                        case chunkenc.ValFloatHistogram:
                                t, fh := it.AtFloatHistogram(nil)
                                ref, err = app.AppendHistogram(ref, lset, t, nil, fh)
                        default:
                                return "", fmt.Errorf("unknown sample type %s", typ.String())
                        }
                        if err != nil {
                                return "", err
                        }
                        sampleCount++
                        prevTyp = typ
                }
                if iterErr := it.Err(); iterErr != nil {
                        return "", iterErr
                }

                // Commit and make a new appender periodically, to avoid building up data in memory.
                if sampleCount > commitAfter {
                        if err = rotate(); err != nil {
                                return "", err
                        }
                }
        }

        if err = app.Commit(); err != nil {
                return "", err
        }

        ulid, err := w.Flush(ctx)
        if err != nil {
                return "", err
        }

        return filepath.Join(dir, ulid.String()), nil
}

// Copyright 2021 The Prometheus Authors
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package tsdbutil

import (
        "errors"
        "fmt"
        "os"
        "path/filepath"

        "github.com/go-kit/log"
        "github.com/go-kit/log/level"
        "github.com/prometheus/client_golang/prometheus"

        tsdb_errors "github.com/prometheus/prometheus/tsdb/errors"
        "github.com/prometheus/prometheus/tsdb/fileutil"
)

// Values reported through the DirLocker clean-start gauge. Per the gauge help
// text: -1 means the lockfile is disabled, 0 means a lockfile from a previous
// execution was replaced, and 1 means the lockfile was created cleanly.
const (
        lockfileDisabled       = -1
        lockfileReplaced       = 0
        lockfileCreatedCleanly = 1
)

// DirLocker guards a directory with a lockfile named "lock" inside it and
// reports through a gauge whether that lockfile was created cleanly.
type DirLocker struct {
        logger log.Logger

        // createdCleanly is set to one of the lockfile* constants.
        createdCleanly prometheus.Gauge

        // releaser is non-nil only while the lock is held.
        releaser fileutil.Releaser
        // path is the absolute path of the lockfile.
        path     string
}

// NewDirLocker returns a DirLocker for the lockfile "lock" inside dir. It does
// not take the lock; call Lock for that.
//
// The clean-start gauge is named after subsystem, registered with r when r is
// non-nil, and starts out at the "lockfile disabled" value. An error is
// returned if dir cannot be made absolute.
func NewDirLocker(dir, subsystem string, l log.Logger, r prometheus.Registerer) (*DirLocker, error) {
        cleanStart := prometheus.NewGauge(prometheus.GaugeOpts{
                Name: fmt.Sprintf("prometheus_%s_clean_start", subsystem),
                Help: "-1: lockfile is disabled. 0: a lockfile from a previous execution was replaced. 1: lockfile creation was clean",
        })
        if r != nil {
                r.MustRegister(cleanStart)
        }
        // Until Lock is called the gauge reports the lockfile as disabled.
        cleanStart.Set(lockfileDisabled)

        absdir, err := filepath.Abs(dir)
        if err != nil {
                return nil, err
        }

        return &DirLocker{
                logger:         l,
                createdCleanly: cleanStart,
                path:           filepath.Join(absdir, "lock"),
        }, nil
}

// Lock takes the lock on the locker directory and records in the clean-start
// gauge whether a lockfile was already present. It fails if this DirLocker
// already holds the lock or if the lockfile cannot be locked.
func (l *DirLocker) Lock() error {
        if l.releaser != nil {
                return errors.New("DB lock already obtained")
        }

        // Assume a clean start unless the lockfile can already be stat'ed.
        state := lockfileCreatedCleanly
        if _, statErr := os.Stat(l.path); statErr == nil {
                level.Warn(l.logger).Log("msg", "A lockfile from a previous execution already existed. It was replaced", "file", l.path)
                state = lockfileReplaced
        }
        l.createdCleanly.Set(float64(state))

        held, _, err := fileutil.Flock(l.path)
        if err != nil {
                return fmt.Errorf("lock DB directory: %w", err)
        }
        l.releaser = held
        return nil
}

// Release gives up the lock and removes the lockfile. It does nothing and
// returns nil when the lock is not held. Errors from releasing the lock and
// from removing the file are combined into the returned error.
func (l *DirLocker) Release() error {
        if l.releaser == nil {
                return nil
        }

        // Release first, then remove the file; both steps always run.
        releaseErr := l.releaser.Release()
        removeErr := os.Remove(l.path)
        l.releaser = nil

        return tsdb_errors.NewMulti(releaseErr, removeErr).Err()
}

// Copyright 2021 The Prometheus Authors
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package tsdbutil

import (
        "fmt"
        "os"
        "testing"

        "github.com/go-kit/log"
        prom_testutil "github.com/prometheus/client_golang/prometheus/testutil"
        "github.com/stretchr/testify/require"

        "github.com/prometheus/prometheus/util/testutil"
)

// TestDirLockerUsage performs a set of tests which guarantee correct usage of
// DirLocker. open should use data as the storage directory, and createLock
// to determine if a lock file should be used.
//
// Each case runs as a subtest in its own temporary directory and checks the
// value of the clean-start gauge after open, and, when the lockfile is
// enabled, that the lockfile is gone after the returned closer is closed.
func TestDirLockerUsage(t *testing.T, open func(t *testing.T, data string, createLock bool) (*DirLocker, testutil.Closer)) {
        t.Helper()

        cases := []struct {
                fileAlreadyExists bool
                lockFileDisabled  bool
                expectedValue     int
        }{
                {
                        fileAlreadyExists: false,
                        lockFileDisabled:  false,
                        expectedValue:     lockfileCreatedCleanly,
                },
                {
                        fileAlreadyExists: true,
                        lockFileDisabled:  false,
                        expectedValue:     lockfileReplaced,
                },
                {
                        fileAlreadyExists: true,
                        lockFileDisabled:  true,
                        expectedValue:     lockfileDisabled,
                },
                {
                        fileAlreadyExists: false,
                        lockFileDisabled:  true,
                        expectedValue:     lockfileDisabled,
                },
        }

        for _, c := range cases {
                t.Run(fmt.Sprintf("%+v", c), func(t *testing.T) {
                        // t.TempDir gives every subtest a unique directory and removes it
                        // (failing the test if removal fails) once the subtest completes.
                        tmpdir := t.TempDir()

                        // Test preconditions (file already exists + lockfile option)
                        if c.fileAlreadyExists {
                                tmpLocker, err := NewDirLocker(tmpdir, "tsdb", log.NewNopLogger(), nil)
                                require.NoError(t, err)
                                err = os.WriteFile(tmpLocker.path, []byte{}, 0o644)
                                require.NoError(t, err)
                        }

                        locker, closer := open(t, tmpdir, !c.lockFileDisabled)
                        require.Equal(t, float64(c.expectedValue), prom_testutil.ToFloat64(locker.createdCleanly))

                        // Close the client. This should delete the lockfile.
                        closer.Close()

                        // Check that the lockfile is always deleted
                        if !c.lockFileDisabled {
                                _, err := os.Stat(locker.path)
                                require.True(t, os.IsNotExist(err), "lockfile was not deleted")
                        }
                })
        }
}

// Copyright 2023 The Prometheus Authors
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package tsdbutil

import (
        "math"

        "github.com/prometheus/prometheus/model/histogram"
)

// GenerateTestHistograms returns n histograms built by GenerateTestHistogram
// for i = 0..n-1. Every histogram except the first is marked as not being a
// counter reset. The result is nil when n is not positive.
func GenerateTestHistograms(n int) (r []*histogram.Histogram) {
        if n <= 0 {
                return nil
        }

        // The first histogram keeps whatever hint GenerateTestHistogram left.
        r = append(r, GenerateTestHistogram(0))
        for i := 1; i < n; i++ {
                r = append(r, SetHistogramNotCounterReset(GenerateTestHistogram(i)))
        }
        return r
}

// GenerateTestHistogramsWithUnknownResetHint returns the same histograms as
// GenerateTestHistograms, but with every counter reset hint set to unknown.
func GenerateTestHistogramsWithUnknownResetHint(n int) []*histogram.Histogram {
        generated := GenerateTestHistograms(n)
        // Elements are pointers, so updating through the range value is enough.
        for _, h := range generated {
                h.CounterResetHint = histogram.UnknownCounterReset
        }
        return generated
}

// GenerateTestHistogram builds a deterministic integer histogram whose count,
// zero count, sum and first bucket entry scale with i. No counter reset hint
// is set here; it is up to the caller to set any known hint.
func GenerateTestHistogram(i int) *histogram.Histogram {
        // Each call yields a fresh slice so the positive and negative sides of
        // the histogram never share backing storage.
        spans := func() []histogram.Span {
                return []histogram.Span{
                        {Offset: 0, Length: 2},
                        {Offset: 1, Length: 2},
                }
        }
        buckets := func() []int64 {
                return []int64{int64(i + 1), 1, -1, 0}
        }

        h := &histogram.Histogram{
                Schema:        1,
                ZeroThreshold: 0.001,
        }
        h.Count = 12 + uint64(i*9)
        h.ZeroCount = 2 + uint64(i)
        h.Sum = 18.4 * float64(i+1)
        h.PositiveSpans, h.PositiveBuckets = spans(), buckets()
        h.NegativeSpans, h.NegativeBuckets = spans(), buckets()
        return h
}

// GenerateTestCustomBucketsHistogram builds a deterministic integer histogram
// that uses the custom-buckets schema with custom values 0 through 4. Its
// count, sum and first bucket entry scale with i; only positive spans and
// buckets are populated.
func GenerateTestCustomBucketsHistogram(i int) *histogram.Histogram {
        h := &histogram.Histogram{
                Schema:       histogram.CustomBucketsSchema,
                CustomValues: []float64{0, 1, 2, 3, 4},
        }
        h.Count = 5 + uint64(i*4)
        h.Sum = 18.4 * float64(i+1)
        h.PositiveSpans = []histogram.Span{
                {Offset: 0, Length: 2},
                {Offset: 1, Length: 2},
        }
        h.PositiveBuckets = []int64{int64(i + 1), 1, -1, 0}
        return h
}

// GenerateTestGaugeHistograms returns n gauge histograms. The seed passed to
// GenerateTestGaugeHistogram follows a sine curve over the position, scaled
// by 100 and shifted by 100, so consecutive histograms go both up and down.
func GenerateTestGaugeHistograms(n int) (r []*histogram.Histogram) {
        for pos := 0; pos < n; pos++ {
                wave := math.Sin(float64(pos)) * 100
                r = append(r, GenerateTestGaugeHistogram(int(wave)+100))
        }
        return r
}

// GenerateTestGaugeHistogram returns the histogram GenerateTestHistogram
// builds for i, with its counter reset hint set to the gauge type.
func GenerateTestGaugeHistogram(i int) *histogram.Histogram {
        gauge := GenerateTestHistogram(i)
        gauge.CounterResetHint = histogram.GaugeType
        return gauge
}

// GenerateTestFloatHistograms returns n float histograms built by
// GenerateTestFloatHistogram for i = 0..n-1. Every histogram except the first
// is marked as not being a counter reset. The result is nil when n is not
// positive.
func GenerateTestFloatHistograms(n int) (r []*histogram.FloatHistogram) {
        if n <= 0 {
                return nil
        }

        // The first histogram keeps whatever hint GenerateTestFloatHistogram left.
        r = append(r, GenerateTestFloatHistogram(0))
        for i := 1; i < n; i++ {
                r = append(r, SetFloatHistogramNotCounterReset(GenerateTestFloatHistogram(i)))
        }
        return r
}

// GenerateTestFloatHistogram builds a deterministic float histogram whose
// count, zero count, sum and bucket values scale with i. No counter reset
// hint is set here; it is up to the caller to set any known hint.
func GenerateTestFloatHistogram(i int) *histogram.FloatHistogram {
        // Each call yields a fresh slice so the positive and negative sides of
        // the histogram never share backing storage.
        spans := func() []histogram.Span {
                return []histogram.Span{
                        {Offset: 0, Length: 2},
                        {Offset: 1, Length: 2},
                }
        }
        buckets := func() []float64 {
                return []float64{float64(i + 1), float64(i + 2), float64(i + 1), float64(i + 1)}
        }

        fh := &histogram.FloatHistogram{
                Schema:        1,
                ZeroThreshold: 0.001,
        }
        fh.Count = 12 + float64(i*9)
        fh.ZeroCount = 2 + float64(i)
        fh.Sum = 18.4 * float64(i+1)
        fh.PositiveSpans, fh.PositiveBuckets = spans(), buckets()
        fh.NegativeSpans, fh.NegativeBuckets = spans(), buckets()
        return fh
}

// GenerateTestCustomBucketsFloatHistogram builds a deterministic float
// histogram that uses the custom-buckets schema with custom values 0 through
// 4. Its count, sum and bucket values scale with i; only positive spans and
// buckets are populated.
func GenerateTestCustomBucketsFloatHistogram(i int) *histogram.FloatHistogram {
        fh := &histogram.FloatHistogram{
                Schema:       histogram.CustomBucketsSchema,
                CustomValues: []float64{0, 1, 2, 3, 4},
        }
        fh.Count = 5 + float64(i*4)
        fh.Sum = 18.4 * float64(i+1)
        fh.PositiveSpans = []histogram.Span{
                {Offset: 0, Length: 2},
                {Offset: 1, Length: 2},
        }
        fh.PositiveBuckets = []float64{float64(i + 1), float64(i + 2), float64(i + 1), float64(i + 1)}
        return fh
}

// GenerateTestGaugeFloatHistograms returns n gauge float histograms. The seed
// passed to GenerateTestGaugeFloatHistogram follows a sine curve over the
// position, scaled by 100 and shifted by 100, so consecutive histograms go
// both up and down.
func GenerateTestGaugeFloatHistograms(n int) (r []*histogram.FloatHistogram) {
        for pos := 0; pos < n; pos++ {
                wave := math.Sin(float64(pos)) * 100
                r = append(r, GenerateTestGaugeFloatHistogram(int(wave)+100))
        }
        return r
}

// GenerateTestGaugeFloatHistogram returns the float histogram
// GenerateTestFloatHistogram builds for i, with its counter reset hint set to
// the gauge type.
func GenerateTestGaugeFloatHistogram(i int) *histogram.FloatHistogram {
        gauge := GenerateTestFloatHistogram(i)
        gauge.CounterResetHint = histogram.GaugeType
        return gauge
}

// SetHistogramNotCounterReset sets the counter reset hint of h to
// histogram.NotCounterReset. It modifies h in place and returns the same
// pointer so calls can be nested.
func SetHistogramNotCounterReset(h *histogram.Histogram) *histogram.Histogram {
        h.CounterResetHint = histogram.NotCounterReset
        return h
}

// SetHistogramCounterReset sets the counter reset hint of h to
// histogram.CounterReset. It modifies h in place and returns the same pointer
// so calls can be nested.
func SetHistogramCounterReset(h *histogram.Histogram) *histogram.Histogram {
        h.CounterResetHint = histogram.CounterReset
        return h
}

// SetFloatHistogramNotCounterReset sets the counter reset hint of h to
// histogram.NotCounterReset. It modifies h in place and returns the same
// pointer so calls can be nested.
func SetFloatHistogramNotCounterReset(h *histogram.FloatHistogram) *histogram.FloatHistogram {
        h.CounterResetHint = histogram.NotCounterReset
        return h
}

// SetFloatHistogramCounterReset sets the counter reset hint of h to
// histogram.CounterReset. It modifies h in place and returns the same pointer
// so calls can be nested.
func SetFloatHistogramCounterReset(h *histogram.FloatHistogram) *histogram.FloatHistogram {
        h.CounterResetHint = histogram.CounterReset
        return h
}

// Copyright 2018 The Prometheus Authors

// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package wlog

import (
        "errors"
        "fmt"
        "io"
        "math"
        "os"
        "path/filepath"
        "slices"
        "strconv"
        "strings"

        "github.com/go-kit/log"
        "github.com/go-kit/log/level"

        "github.com/prometheus/prometheus/model/labels"
        "github.com/prometheus/prometheus/tsdb/chunks"
        tsdb_errors "github.com/prometheus/prometheus/tsdb/errors"
        "github.com/prometheus/prometheus/tsdb/fileutil"
        "github.com/prometheus/prometheus/tsdb/record"
        "github.com/prometheus/prometheus/tsdb/tombstones"
)

// CheckpointStats returns stats about a created checkpoint.
//
// The Dropped* fields count the entries Checkpoint read but did not carry over
// into the new checkpoint; the Total* fields count every entry it read,
// whether it was kept or dropped.
type CheckpointStats struct {
        DroppedSeries     int
        DroppedSamples    int // Includes histograms.
        DroppedTombstones int
        DroppedExemplars  int
        DroppedMetadata   int
        TotalSeries       int // Processed series including dropped ones.
        TotalSamples      int // Processed float and histogram samples including dropped ones.
        TotalTombstones   int // Processed tombstones including dropped ones.
        TotalExemplars    int // Processed exemplars including dropped ones.
        TotalMetadata     int // Processed metadata including dropped ones.
}

// LastCheckpoint returns the path (dir joined with the checkpoint's directory
// name) and the index of the checkpoint with the highest index in dir.
// If dir does not contain any checkpoints, ErrNotFound is returned.
func LastCheckpoint(dir string) (string, int, error) {
        refs, err := listCheckpoints(dir)
        switch {
        case err != nil:
                return "", 0, err
        case len(refs) == 0:
                return "", 0, record.ErrNotFound
        }

        // listCheckpoints sorts ascending by index, so the newest one is last.
        latest := refs[len(refs)-1]
        return filepath.Join(dir, latest.name), latest.index, nil
}

// DeleteCheckpoints removes every checkpoint directory in dir whose index is
// strictly below maxIndex. Removal errors do not stop the loop; they are
// collected and returned together.
func DeleteCheckpoints(dir string, maxIndex int) error {
        refs, err := listCheckpoints(dir)
        if err != nil {
                return err
        }

        errs := tsdb_errors.NewMulti()
        // refs is sorted ascending by index, so stop at the first checkpoint
        // that reaches maxIndex.
        for i := 0; i < len(refs) && refs[i].index < maxIndex; i++ {
                errs.Add(os.RemoveAll(filepath.Join(dir, refs[i].name)))
        }
        return errs.Err()
}

// checkpointPrefix is the name prefix of checkpoint directories; the
// checkpoint index follows it.
const checkpointPrefix = "checkpoint."

// Checkpoint creates a compacted checkpoint of segments in range [from, to] in the given WAL.
// It includes the most recent checkpoint if it exists.
// All series not satisfying keep, samples/tombstones/exemplars below mint and
// metadata that are not the latest are dropped.
//
// The checkpoint is stored in a directory named checkpoint.N in the same
// segmented format as the original WAL itself.
// This makes it easy to read it through the WAL package and concatenate
// it with the original WAL.
//
// The checkpoint is first written to a temporary directory (the final name
// with a ".tmp" suffix), which is synced and then moved into place. The
// temporary directory is removed on every return path.
//
// On success the returned stats report how many entries of each kind were
// processed and dropped. On failure the stats are nil and the error says
// which step failed.
func Checkpoint(logger log.Logger, w *WL, from, to int, keep func(id chunks.HeadSeriesRef) bool, mint int64) (*CheckpointStats, error) {
        stats := &CheckpointStats{}
        var sgmReader io.ReadCloser

        level.Info(logger).Log("msg", "Creating checkpoint", "from_segment", from, "to_segment", to, "mint", mint)

        {
                var sgmRange []SegmentRange
                dir, idx, err := LastCheckpoint(w.Dir())
                if err != nil && !errors.Is(err, record.ErrNotFound) {
                        return nil, fmt.Errorf("find last checkpoint: %w", err)
                }
                last := idx + 1
                if err == nil {
                        if from > last {
                                return nil, fmt.Errorf("unexpected gap to last checkpoint. expected:%v, requested:%v", last, from)
                        }
                        // Ignore WAL files below the checkpoint. They shouldn't exist to begin with.
                        from = last

                        // Read the previous checkpoint first, then the WAL segments.
                        sgmRange = append(sgmRange, SegmentRange{Dir: dir, Last: math.MaxInt32})
                }

                sgmRange = append(sgmRange, SegmentRange{Dir: w.Dir(), First: from, Last: to})
                sgmReader, err = NewSegmentsRangeReader(sgmRange...)
                if err != nil {
                        return nil, fmt.Errorf("create segment reader: %w", err)
                }
                defer sgmReader.Close()
        }

        cpdir := checkpointDir(w.Dir(), to)
        cpdirtmp := cpdir + ".tmp"

        if err := os.RemoveAll(cpdirtmp); err != nil {
                return nil, fmt.Errorf("remove previous temporary checkpoint dir: %w", err)
        }

        if err := os.MkdirAll(cpdirtmp, 0o777); err != nil {
                return nil, fmt.Errorf("create checkpoint dir: %w", err)
        }
        cp, err := New(nil, nil, cpdirtmp, w.CompressionType())
        if err != nil {
                return nil, fmt.Errorf("open checkpoint: %w", err)
        }

        // Ensures that an early return caused by an error doesn't leave any tmp files.
        defer func() {
                cp.Close()
                os.RemoveAll(cpdirtmp)
        }()

        r := NewReader(sgmReader)

        var (
                series                []record.RefSeries
                samples               []record.RefSample
                histogramSamples      []record.RefHistogramSample
                floatHistogramSamples []record.RefFloatHistogramSample
                tstones               []tombstones.Stone
                exemplars             []record.RefExemplar
                metadata              []record.RefMetadata
                st                    = labels.NewSymbolTable() // Needed for decoding; labels do not outlive this function.
                dec                   = record.NewDecoder(st)
                enc                   record.Encoder
                buf                   []byte
                recs                  [][]byte

                latestMetadataMap = make(map[chunks.HeadSeriesRef]record.RefMetadata)
        )
        for r.Next() {
                // Reset the scratch slices but keep their capacity for reuse.
                series, samples, histogramSamples, floatHistogramSamples, tstones, exemplars, metadata = series[:0], samples[:0], histogramSamples[:0], floatHistogramSamples[:0], tstones[:0], exemplars[:0], metadata[:0]

                // We don't reset the buffer since we batch up multiple records
                // before writing them to the checkpoint.
                // Remember where the record for this iteration starts.
                start := len(buf)
                rec := r.Record()

                switch dec.Type(rec) {
                case record.Series:
                        series, err = dec.Series(rec, series)
                        if err != nil {
                                return nil, fmt.Errorf("decode series: %w", err)
                        }
                        // Drop irrelevant series in place.
                        repl := series[:0]
                        for _, s := range series {
                                if keep(s.Ref) {
                                        repl = append(repl, s)
                                }
                        }
                        if len(repl) > 0 {
                                buf = enc.Series(repl, buf)
                        }
                        stats.TotalSeries += len(series)
                        stats.DroppedSeries += len(series) - len(repl)

                case record.Samples:
                        samples, err = dec.Samples(rec, samples)
                        if err != nil {
                                return nil, fmt.Errorf("decode samples: %w", err)
                        }
                        // Drop irrelevant samples in place.
                        repl := samples[:0]
                        for _, s := range samples {
                                if s.T >= mint {
                                        repl = append(repl, s)
                                }
                        }
                        if len(repl) > 0 {
                                buf = enc.Samples(repl, buf)
                        }
                        stats.TotalSamples += len(samples)
                        stats.DroppedSamples += len(samples) - len(repl)

                case record.HistogramSamples:
                        histogramSamples, err = dec.HistogramSamples(rec, histogramSamples)
                        if err != nil {
                                return nil, fmt.Errorf("decode histogram samples: %w", err)
                        }
                        // Drop irrelevant histogramSamples in place.
                        repl := histogramSamples[:0]
                        for _, h := range histogramSamples {
                                if h.T >= mint {
                                        repl = append(repl, h)
                                }
                        }
                        if len(repl) > 0 {
                                buf = enc.HistogramSamples(repl, buf)
                        }
                        stats.TotalSamples += len(histogramSamples)
                        stats.DroppedSamples += len(histogramSamples) - len(repl)

                case record.FloatHistogramSamples:
                        floatHistogramSamples, err = dec.FloatHistogramSamples(rec, floatHistogramSamples)
                        if err != nil {
                                return nil, fmt.Errorf("decode float histogram samples: %w", err)
                        }
                        // Drop irrelevant floatHistogramSamples in place.
                        repl := floatHistogramSamples[:0]
                        for _, fh := range floatHistogramSamples {
                                if fh.T >= mint {
                                        repl = append(repl, fh)
                                }
                        }
                        if len(repl) > 0 {
                                buf = enc.FloatHistogramSamples(repl, buf)
                        }
                        stats.TotalSamples += len(floatHistogramSamples)
                        stats.DroppedSamples += len(floatHistogramSamples) - len(repl)

                case record.Tombstones:
                        tstones, err = dec.Tombstones(rec, tstones)
                        if err != nil {
                                return nil, fmt.Errorf("decode deletes: %w", err)
                        }
                        // Drop irrelevant tombstones in place.
                        // A tombstone is kept as soon as one of its intervals reaches mint.
                        repl := tstones[:0]
                        for _, s := range tstones {
                                for _, iv := range s.Intervals {
                                        if iv.Maxt >= mint {
                                                repl = append(repl, s)
                                                break
                                        }
                                }
                        }
                        if len(repl) > 0 {
                                buf = enc.Tombstones(repl, buf)
                        }
                        stats.TotalTombstones += len(tstones)
                        stats.DroppedTombstones += len(tstones) - len(repl)

                case record.Exemplars:
                        exemplars, err = dec.Exemplars(rec, exemplars)
                        if err != nil {
                                return nil, fmt.Errorf("decode exemplars: %w", err)
                        }
                        // Drop irrelevant exemplars in place.
                        repl := exemplars[:0]
                        for _, e := range exemplars {
                                if e.T >= mint {
                                        repl = append(repl, e)
                                }
                        }
                        if len(repl) > 0 {
                                buf = enc.Exemplars(repl, buf)
                        }
                        stats.TotalExemplars += len(exemplars)
                        stats.DroppedExemplars += len(exemplars) - len(repl)
                case record.Metadata:
                        // Assign to the outer metadata and err. Redeclaring them here
                        // with := would shadow the scratch slice, which would then stay
                        // nil and never be reused across records.
                        metadata, err = dec.Metadata(rec, metadata)
                        if err != nil {
                                return nil, fmt.Errorf("decode metadata: %w", err)
                        }
                        // Only keep reference to the latest found metadata for each refID.
                        // Nothing is written to buf here; the latest entries are flushed
                        // once after all records have been read.
                        repl := 0
                        for _, m := range metadata {
                                if keep(m.Ref) {
                                        if _, ok := latestMetadataMap[m.Ref]; !ok {
                                                repl++
                                        }
                                        latestMetadataMap[m.Ref] = m
                                }
                        }
                        stats.TotalMetadata += len(metadata)
                        stats.DroppedMetadata += len(metadata) - repl
                default:
                        // Unknown record type, probably from a future Prometheus version.
                        continue
                }
                if len(buf[start:]) == 0 {
                        continue // All contents discarded.
                }
                recs = append(recs, buf[start:])

                // Flush records in 1 MB increments.
                if len(buf) > 1*1024*1024 {
                        if err := cp.Log(recs...); err != nil {
                                return nil, fmt.Errorf("flush records: %w", err)
                        }
                        buf, recs = buf[:0], recs[:0]
                }
        }
        // If we hit any corruption during checkpointing, repairing is not an option.
        // The head won't know which series records are lost.
        if r.Err() != nil {
                return nil, fmt.Errorf("read segments: %w", r.Err())
        }

        // Flush remaining records.
        if err := cp.Log(recs...); err != nil {
                return nil, fmt.Errorf("flush records: %w", err)
        }

        // Flush latest metadata records for each series.
        if len(latestMetadataMap) > 0 {
                latestMetadata := make([]record.RefMetadata, 0, len(latestMetadataMap))
                for _, m := range latestMetadataMap {
                        latestMetadata = append(latestMetadata, m)
                }
                if err := cp.Log(enc.Metadata(latestMetadata, buf[:0])); err != nil {
                        return nil, fmt.Errorf("flush metadata records: %w", err)
                }
        }

        if err := cp.Close(); err != nil {
                return nil, fmt.Errorf("close checkpoint: %w", err)
        }

        // Sync temporary directory before rename.
        df, err := fileutil.OpenDir(cpdirtmp)
        if err != nil {
                return nil, fmt.Errorf("open temporary checkpoint directory: %w", err)
        }
        if err := df.Sync(); err != nil {
                df.Close()
                return nil, fmt.Errorf("sync temporary checkpoint directory: %w", err)
        }
        if err = df.Close(); err != nil {
                return nil, fmt.Errorf("close temporary checkpoint directory: %w", err)
        }

        if err := fileutil.Replace(cpdirtmp, cpdir); err != nil {
                return nil, fmt.Errorf("rename checkpoint directory: %w", err)
        }

        return stats, nil
}

// checkpointDir returns the path of the checkpoint with index i inside dir.
// The directory name is checkpointPrefix followed by i, zero-padded to at
// least eight digits.
func checkpointDir(dir string, i int) string {
        name := checkpointPrefix + fmt.Sprintf("%08d", i)
        return filepath.Join(dir, name)
}

// checkpointRef identifies a checkpoint directory found by listCheckpoints.
type checkpointRef struct {
        // name is the directory entry name, including the checkpoint prefix.
        name  string
        // index is the number parsed from the part of name after the prefix.
        index int
}

// listCheckpoints returns the checkpoints found directly inside dir, sorted
// ascending by index.
//
// Entries without the checkpoint prefix, and entries whose suffix is not an
// integer, are skipped. A prefixed entry that is not a directory is an error,
// as is a failure to read dir.
func listCheckpoints(dir string) (refs []checkpointRef, err error) {
        entries, err := os.ReadDir(dir)
        if err != nil {
                return nil, err
        }

        for _, entry := range entries {
                name := entry.Name()
                suffix, hasPrefix := strings.CutPrefix(name, checkpointPrefix)
                if !hasPrefix {
                        continue
                }
                if !entry.IsDir() {
                        return nil, fmt.Errorf("checkpoint %s is not a directory", name)
                }
                index, convErr := strconv.Atoi(suffix)
                if convErr != nil {
                        // Not a numbered checkpoint directory; ignore it.
                        continue
                }

                refs = append(refs, checkpointRef{name: name, index: index})
        }

        byIndex := func(a, b checkpointRef) int {
                return a.index - b.index
        }
        slices.SortFunc(refs, byIndex)

        return refs, nil
}

// Copyright 2019 The Prometheus Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package wlog

import (
        "encoding/binary"
        "errors"
        "fmt"
        "hash/crc32"
        "io"

        "github.com/go-kit/log"
        "github.com/go-kit/log/level"
        "github.com/golang/snappy"
        "github.com/klauspost/compress/zstd"
        "github.com/prometheus/client_golang/prometheus"
)

// LiveReaderMetrics holds all metrics exposed by the LiveReader.
type LiveReaderMetrics struct {
        // readerCorruptionErrors is a counter vector with a single "error" label.
        readerCorruptionErrors *prometheus.CounterVec
}

// NewLiveReaderMetrics creates the metrics to be injected at LiveReader
// instantiation. They are registered with reg unless reg is nil.
func NewLiveReaderMetrics(reg prometheus.Registerer) *LiveReaderMetrics {
        corruptionErrors := prometheus.NewCounterVec(
                prometheus.CounterOpts{
                        Name: "prometheus_tsdb_wal_reader_corruption_errors_total",
                        Help: "Errors encountered when reading the WAL.",
                },
                []string{"error"},
        )
        if reg != nil {
                reg.MustRegister(corruptionErrors)
        }

        return &LiveReaderMetrics{readerCorruptionErrors: corruptionErrors}
}

// NewLiveReader returns a new live reader that reads from r, logs through
// logger and reports through metrics. The reader is created in permissive
// mode.
func NewLiveReader(logger log.Logger, metrics *LiveReaderMetrics, r io.Reader) *LiveReader {
        // Calling zstd.NewReader with a nil io.Reader and no options cannot
        // return an error, so the error result is deliberately discarded.
        decoder, _ := zstd.NewReader(nil)

        return &LiveReader{
                logger:     logger,
                metrics:    metrics,
                rdr:        r,
                zstdReader: decoder,

                // Until we understand how they come about, make readers permissive
                // to records spanning pages.
                permissive: true,
        }
}

// LiveReader reads WAL records from an io.Reader. It allows reading of WALs
// that are still in the process of being written, and returns records as soon
// as they can be read.
//
// Field overview:
//   - rdr is the source that fillBuffer reads from; err is what Err reports.
//   - rec holds the record returned by Record; compressBuf collects the
//     fragments of a compressed record until they are decoded into rec.
//   - zstdReader decodes zstd-compressed records.
//   - hdr holds the header of the fragment most recently parsed by readRecord.
//   - buf is a page-sized buffer; readIndex and writeIndex delimit its unread part.
type LiveReader struct {
        logger      log.Logger
        rdr         io.Reader
        err         error
        rec         []byte
        compressBuf []byte
        zstdReader  *zstd.Decoder
        hdr         [recordHeaderSize]byte
        buf         [pageSize]byte
        readIndex   int   // Index in buf to start at for next read.
        writeIndex  int   // Index in buf to start at for next write.
        total       int64 // Total bytes processed during reading in calls to Next().
        index       int   // Used to track partial records, should be 0 at the start of every new record.

        // For testing, we can treat EOF as a non-error.
        eofNonErr bool

        // We sometimes see records span page boundaries. This should never happen,
        // but it does. Until we track down why, set permissive to true to tolerate it.
        // NB the non-live Reader implementation allows for this.
        permissive bool

        // metrics receives a "record_span_page" count whenever a tolerated
        // page-spanning record is seen.
        metrics *LiveReaderMetrics
}

// Err reports the error that stopped the last call to Next, if any. An io.EOF
// is not terminal: Next may be called again once more data may have been
// written, and it is up to the caller to decide when to give up. Any other
// error is terminal and the reader must not be used again. When eofNonErr is
// set (testing only), io.EOF is reported as nil.
func (r *LiveReader) Err() error {
        err := r.err
        if r.eofNonErr && errors.Is(err, io.EOF) {
                return nil
        }
        return err
}

// Offset returns the number of bytes consumed from this segment so far, i.e.
// the running total of record and page-padding bytes that buildRecord has
// advanced past.
func (r *LiveReader) Offset() int64 {
        return r.total
}

// fillBuffer issues a single Read on the underlying reader into the unused
// tail of buf and advances writeIndex by the number of bytes received. It
// returns that byte count together with the error from Read, unmodified.
func (r *LiveReader) fillBuffer() (int, error) {
        free := r.buf[r.writeIndex:]
        n, err := r.rdr.Read(free)
        r.writeIndex += n
        return n, err
}

// Next returns true if Record() will contain a full record.
// If Next returns false, you should always check the result of Err().
// A false return guarantees there are no more records if the segment is closed
// and not corrupt; otherwise, if Err() == io.EOF, you should try again when more
// data has been written.
func (r *LiveReader) Next() bool {
        for {
                // If buildRecord returns a non-EOF error, the game is up - the segment is
                // corrupt. If buildRecord returns an EOF, we try to read more in
                // fillBuffer later on. If that fails to read anything (n=0 && err=EOF),
                // we return EOF and the user can try again later. If we have a full
                // page, buildRecord is guaranteed to return a record or a non-EOF; it
                // checks that the records fit in pages.
                switch ok, err := r.buildRecord(); {
                case ok:
                        return true
                case err != nil && !errors.Is(err, io.EOF):
                        r.err = err
                        return false
                }

                // If we've filled the page and not found a record, this
                // means records have started to span pages. This shouldn't happen,
                // but it does, and until we find out why we need to deal with it:
                // shift the unread bytes to the front of buf to make room for more.
                if r.permissive && r.writeIndex == pageSize && r.readIndex > 0 {
                        copy(r.buf[:], r.buf[r.readIndex:])
                        r.writeIndex -= r.readIndex
                        r.readIndex = 0
                        continue
                }

                // The whole page has been consumed; start over with an empty buffer.
                if r.readIndex == pageSize {
                        r.writeIndex = 0
                        r.readIndex = 0
                }

                if r.writeIndex != pageSize {
                        n, err := r.fillBuffer()
                        // Stop when nothing new was read or the read failed with something
                        // other than EOF. r.err takes whatever the read returned, which is
                        // nil if the reader returned zero bytes without an error.
                        if n == 0 || (err != nil && !errors.Is(err, io.EOF)) {
                                r.err = err
                                return false
                        }
                }
        }
}

// Record returns the current record, i.e. the one assembled by the most
// recent successful call to Next.
// The returned byte slice is only valid until the next call to Next, because
// the underlying buffer is reused.
func (r *LiveReader) Record() []byte {
        return r.rec
}

// buildRecord rebuilds a full record from potentially partial records held in
// the internal buffer. It returns true once a full record is available in
// LiveReader.rec. It returns false if there was an error or if no full record
// could be assembled from the buffered data (in which case the error may be
// nil or io.EOF). Uncompressed fragments are appended to LiveReader.rec;
// compressed fragments are collected in LiveReader.compressBuf and decoded
// into LiveReader.rec when the final fragment arrives.
func (r *LiveReader) buildRecord() (bool, error) {
        for {
                // Check that we have data in the internal buffer to read.
                if r.writeIndex <= r.readIndex {
                        return false, nil
                }

                // Attempt to read a record, partial or otherwise.
                temp, n, err := r.readRecord()
                if err != nil {
                        return false, err
                }

                // Consume what readRecord processed. A nil temp with a nil error
                // means only page padding was skipped, so there is no data to add.
                r.readIndex += n
                r.total += int64(n)
                if temp == nil {
                        return false, nil
                }

                // A first or full fragment starts a new record: drop leftovers.
                rt := recTypeFromHeader(r.hdr[0])
                if rt == recFirst || rt == recFull {
                        r.rec = r.rec[:0]
                        r.compressBuf = r.compressBuf[:0]
                }

                isSnappyCompressed := r.hdr[0]&snappyMask == snappyMask
                isZstdCompressed := r.hdr[0]&zstdMask == zstdMask

                if isSnappyCompressed || isZstdCompressed {
                        r.compressBuf = append(r.compressBuf, temp...)
                } else {
                        r.rec = append(r.rec, temp...)
                }

                // Reject fragment sequences that cannot form a record; the fragment
                // counter is reset so the next record starts from a clean state.
                if err := validateRecord(rt, r.index); err != nil {
                        r.index = 0
                        return false, err
                }
                if rt == recLast || rt == recFull {
                        r.index = 0
                        if isSnappyCompressed && len(r.compressBuf) > 0 {
                                // The snappy library uses `len` to calculate if we need a new buffer.
                                // In order to allocate as few buffers as possible make the length
                                // equal to the capacity.
                                r.rec = r.rec[:cap(r.rec)]
                                r.rec, err = snappy.Decode(r.rec, r.compressBuf)
                                if err != nil {
                                        return false, err
                                }
                        } else if isZstdCompressed && len(r.compressBuf) > 0 {
                                r.rec, err = r.zstdReader.DecodeAll(r.compressBuf, r.rec[:0])
                                if err != nil {
                                        return false, err
                                }
                        }
                        return true, nil
                }
                // Only increment r.index for fragments that carried data (page padding
                // returned above), since it is used to determine valid record sequences.
                r.index++
        }
}

// validateRecord checks that a fragment of type typ is legal at position i of
// the record currently being assembled, where i is the number of fragments
// already read for that record. A full or first fragment is only valid when
// i == 0; a middle or last fragment is only valid when i > 0 (more precisely,
// when i != 0). Any other fragment type is rejected. It returns nil when the
// sequence is valid.
func validateRecord(typ recType, i int) error {
        var problem string

        switch typ {
        case recFull:
                if i != 0 {
                        problem = "unexpected full record"
                }
        case recFirst:
                if i != 0 {
                        problem = "unexpected first record, dropping buffer"
                }
        case recMiddle:
                if i == 0 {
                        problem = "unexpected middle record, dropping buffer"
                }
        case recLast:
                if i == 0 {
                        problem = "unexpected last record, dropping buffer"
                }
        default:
                return fmt.Errorf("unexpected record type %d", typ)
        }

        if problem != "" {
                return errors.New(problem)
        }
        return nil
}

// readRecord reads a sub-record (see recType) from the internal buffer,
// starting at readIndex. It could be a full record (recFull) if the record fits
// within the bounds of a single page.
//
// It returns the record data, the number of buffer bytes the caller should
// consume, and an error:
//   - For page-term padding it returns a nil slice and the number of padding
//     bytes up to the end of the current page.
//   - io.EOF means the buffer does not yet hold the complete padding, header
//     or payload; nothing should be consumed.
//   - Any other error means a non-zero byte was found in page-term padding,
//     the record does not fit (in a page, or in the current page when the
//     reader is not permissive), or the checksum did not match.
//
// The returned slice aliases the internal buffer. readRecord does not advance
// readIndex or total (the caller does), but it does overwrite r.hdr with the
// header of the fragment it parsed, and it may increment the corruption metric
// and log a warning for page-spanning records.
func (r *LiveReader) readRecord() ([]byte, int, error) {
        // Special case: for recPageTerm, check that all bytes to the end of the
        // page are zero, consume them but don't return them.
        if r.buf[r.readIndex] == byte(recPageTerm) {
                // End of page won't necessarily be end of buffer, as we may have
                // got misaligned by records spanning page boundaries.
                // r.total % pageSize is the offset into the current page
                // that r.readIndex points to in buf.  Therefore
                // pageSize - (r.total % pageSize) is the amount left to read of
                // the current page.
                remaining := int(pageSize - (r.total % pageSize))
                if r.readIndex+remaining > r.writeIndex {
                        return nil, 0, io.EOF
                }

                for i := r.readIndex; i < r.readIndex+remaining; i++ {
                        if r.buf[i] != 0 {
                                return nil, 0, errors.New("unexpected non-zero byte in page term bytes")
                        }
                }

                return nil, remaining, nil
        }

        // Not a recPageTerm; read the record and check the checksum.
        if r.writeIndex-r.readIndex < recordHeaderSize {
                return nil, 0, io.EOF
        }

        copy(r.hdr[:], r.buf[r.readIndex:r.readIndex+recordHeaderSize])
        length := int(binary.BigEndian.Uint16(r.hdr[1:]))
        crc := binary.BigEndian.Uint32(r.hdr[3:])

        // end is the buffer index just past this record's payload.
        end := r.readIndex + recordHeaderSize + length
        if end > pageSize {
                if !r.permissive {
                        return nil, 0, fmt.Errorf("record would overflow current page: %d > %d", end, pageSize)
                }
                // A reader built without metrics must not panic on this tolerated path.
                if r.metrics != nil {
                        r.metrics.readerCorruptionErrors.WithLabelValues("record_span_page").Inc()
                }
                // Log the real end offset so that "start" and "end" describe the same range.
                level.Warn(r.logger).Log("msg", "Record spans page boundaries", "start", r.readIndex, "end", end, "pageSize", pageSize)
        }
        if recordHeaderSize+length > pageSize {
                return nil, 0, fmt.Errorf("record length greater than a single page: %d > %d", recordHeaderSize+length, pageSize)
        }
        if end > r.writeIndex {
                // The payload has not been fully buffered yet.
                return nil, 0, io.EOF
        }

        rec := r.buf[r.readIndex+recordHeaderSize : end]
        if c := crc32.Checksum(rec, castagnoliTable); c != crc {
                return nil, 0, fmt.Errorf("unexpected checksum %x, expected %x", c, crc)
        }

        return rec, length + recordHeaderSize, nil
}

// Copyright 2019 The Prometheus Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package wlog

import (
        "encoding/binary"
        "errors"
        "fmt"
        "hash/crc32"
        "io"

        "github.com/golang/snappy"
        "github.com/klauspost/compress/zstd"
)

// Reader reads WAL records from an io.Reader.
//
// rec holds the record returned by Record; compressBuf collects the fragments
// of a compressed record until they are decoded into rec. buf is page-sized
// scratch space that holds both the fragment header and its payload while a
// fragment is being read. err is the error reported (wrapped) by Err.
type Reader struct {
        rdr         io.Reader
        err         error
        rec         []byte
        compressBuf []byte
        zstdReader  *zstd.Decoder
        buf         [pageSize]byte
        total       int64   // Total bytes processed.
        curRecTyp   recType // Used for checking that the last record is not torn.
}

// NewReader returns a Reader that reads WAL records from r.
func NewReader(r io.Reader) *Reader {
        // zstd.NewReader cannot fail when given a nil io.Reader and no options,
        // which is why its error is discarded.
        decoder, _ := zstd.NewReader(nil)

        reader := &Reader{
                rdr:        r,
                zstdReader: decoder,
        }
        return reader
}

// Next advances the reader to the next record and reports whether one exists.
// It must not be called again after it returned false. On a false return,
// Err tells a clean end of input (nil) apart from a failure.
func (r *Reader) Next() bool {
        err := r.next()

        // errors.Is is false for a nil error, so this branch is only taken when
        // the input ended.
        if errors.Is(err, io.EOF) {
                // The last WAL segment record shouldn't be torn (it should be full
                // or last). It would be torn after a crash just before the final
                // part of the record could be persisted to disk.
                torn := r.curRecTyp == recFirst || r.curRecTyp == recMiddle
                if torn {
                        r.err = errors.New("last record is torn")
                }
                return false
        }

        r.err = err
        return err == nil
}

// next reads fragments from the underlying reader until a complete record has
// been assembled in r.rec, skipping page-term padding on the way. It returns
// nil once a record is complete. Otherwise it returns the read error (wrapped
// with context for header and padding reads, unwrapped for payload reads), a
// validation error for an over-long fragment, a checksum mismatch or an invalid
// fragment sequence, or a decompression error.
func (r *Reader) next() (err error) {
        // We have to use r.buf since allocating byte arrays here fails escape
        // analysis and ends up on the heap, even though it seemingly should not.
        hdr := r.buf[:recordHeaderSize]
        buf := r.buf[recordHeaderSize:]

        r.rec = r.rec[:0]
        r.compressBuf = r.compressBuf[:0]

        // i counts the data fragments read so far for the current record.
        i := 0
        for {
                if _, err = io.ReadFull(r.rdr, hdr[:1]); err != nil {
                        return fmt.Errorf("read first header byte: %w", err)
                }
                r.total++
                r.curRecTyp = recTypeFromHeader(hdr[0])
                isSnappyCompressed := hdr[0]&snappyMask == snappyMask
                isZstdCompressed := hdr[0]&zstdMask == zstdMask

                // Gobble up zero bytes.
                if r.curRecTyp == recPageTerm {
                        // recPageTerm is a single byte that indicates the rest of the page is padded.
                        // If it's the first byte in a page, buf is too small and
                        // needs to be resized to fit pageSize-1 bytes.
                        // From here on buf overlaps hdr[1:]; that is safe because length
                        // and crc are copied out of hdr before any payload is read into buf.
                        buf = r.buf[1:]

                        // We are pedantic and check whether the zeros are actually up
                        // to a page boundary.
                        // It's not strictly necessary but may catch sketchy state early.
                        k := pageSize - (r.total % pageSize)
                        if k == pageSize {
                                continue // Initial 0 byte was last page byte.
                        }
                        n, err := io.ReadFull(r.rdr, buf[:k])
                        if err != nil {
                                return fmt.Errorf("read remaining zeros: %w", err)
                        }
                        r.total += int64(n)

                        for _, c := range buf[:k] {
                                if c != 0 {
                                        return errors.New("unexpected non-zero byte in padded page")
                                }
                        }
                        continue
                }
                n, err := io.ReadFull(r.rdr, hdr[1:])
                if err != nil {
                        return fmt.Errorf("read remaining header: %w", err)
                }
                r.total += int64(n)

                // The header stores a big-endian 16-bit payload length at offset 1
                // and a big-endian 32-bit checksum at offset 3.
                var (
                        length = binary.BigEndian.Uint16(hdr[1:])
                        crc    = binary.BigEndian.Uint32(hdr[3:])
                )

                if length > pageSize-recordHeaderSize {
                        return fmt.Errorf("invalid record size %d", length)
                }
                n, err = io.ReadFull(r.rdr, buf[:length])
                if err != nil {
                        return err
                }
                r.total += int64(n)

                // Defensive check: io.ReadFull already reports an error on a short read.
                if n != int(length) {
                        return fmt.Errorf("invalid size: expected %d, got %d", length, n)
                }
                if c := crc32.Checksum(buf[:length], castagnoliTable); c != crc {
                        return fmt.Errorf("unexpected checksum %x, expected %x", c, crc)
                }

                // Compressed fragments are collected and decoded only once the
                // record is complete; uncompressed fragments go straight into rec.
                if isSnappyCompressed || isZstdCompressed {
                        r.compressBuf = append(r.compressBuf, buf[:length]...)
                } else {
                        r.rec = append(r.rec, buf[:length]...)
                }

                if err := validateRecord(r.curRecTyp, i); err != nil {
                        return err
                }
                if r.curRecTyp == recLast || r.curRecTyp == recFull {
                        if isSnappyCompressed && len(r.compressBuf) > 0 {
                                // The snappy library uses `len` to calculate if we need a new buffer.
                                // In order to allocate as few buffers as possible make the length
                                // equal to the capacity.
                                r.rec = r.rec[:cap(r.rec)]
                                r.rec, err = snappy.Decode(r.rec, r.compressBuf)
                                return err
                        } else if isZstdCompressed && len(r.compressBuf) > 0 {
                                r.rec, err = r.zstdReader.DecodeAll(r.compressBuf, r.rec[:0])
                                return err
                        }
                        return nil
                }

                // Only increment i for fragments that carried data (page padding
                // continues the loop earlier), since we use it to determine valid
                // record sequences.
                i++
        }
}

// Err returns the last encountered error wrapped in a CorruptionErr, or nil if
// there was none. When the underlying reader is a *segmentBufReader, the
// error carries the directory, index and offset of the segment being read;
// otherwise the segment is reported as -1 and the offset is the total number
// of bytes processed from the stream.
func (r *Reader) Err() error {
        if r.err == nil {
                return nil
        }

        corruption := &CorruptionErr{
                Err:     r.err,
                Segment: -1,
                Offset:  r.total,
        }
        if b, ok := r.rdr.(*segmentBufReader); ok {
                corruption.Dir = b.segs[b.cur].Dir()
                corruption.Segment = b.segs[b.cur].Index()
                corruption.Offset = int64(b.off)
        }
        return corruption
}

// Record returns the current record, i.e. the one assembled by the most
// recent call to Next. The returned byte slice is only valid until the next
// call to Next, because the underlying buffer is reused.
func (r *Reader) Record() []byte {
        return r.rec
}

// Segment returns the index of the segment currently being read, or -1 when
// the underlying reader is not a *segmentBufReader and no segment can be
// inferred.
func (r *Reader) Segment() int {
        b, ok := r.rdr.(*segmentBufReader)
        if !ok {
                return -1
        }
        return b.segs[b.cur].Index()
}

// Offset returns the current position within the segment being read. When the
// underlying reader is not a *segmentBufReader, it falls back to the total
// number of bytes processed from the stream.
func (r *Reader) Offset() int64 {
        b, ok := r.rdr.(*segmentBufReader)
        if !ok {
                return r.total
        }
        return int64(b.off)
}

// Copyright 2018 The Prometheus Authors
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package wlog

import (
        "errors"
        "fmt"
        "io"
        "math"
        "os"
        "path/filepath"
        "slices"
        "strconv"
        "strings"
        "time"

        "github.com/go-kit/log"
        "github.com/go-kit/log/level"
        "github.com/prometheus/client_golang/prometheus"

        "github.com/prometheus/prometheus/model/labels"
        "github.com/prometheus/prometheus/model/timestamp"
        "github.com/prometheus/prometheus/tsdb/record"
)

// Watcher constants. consumer is the label name that identifies a watcher in
// the WatcherMetrics vectors.
// NOTE(review): checkpointPeriod and segmentCheckPeriod are presumably the
// polling intervals for new checkpoints and new segments; their use lies
// outside this part of the file, so confirm there.
const (
        checkpointPeriod   = 5 * time.Second
        segmentCheckPeriod = 100 * time.Millisecond
        consumer           = "consumer"
)

// ErrIgnorable marks an error that Watcher.Run does not treat as fatal: when
// watching a segment fails with it, Run moves on to the next segment.
// NOTE(review): readTimeout is presumably a variable so that tests can shorten
// it; its use lies outside this part of the file, so confirm there.
var (
        ErrIgnorable = errors.New("ignore me")
        readTimeout  = 15 * time.Second
)

// WriteTo is an interface used by the Watcher to send the samples it has read
// from the WAL on to somewhere else. Functions will be called concurrently
// and it is left to the implementer to make sure they are safe.
type WriteTo interface {
        // Append and AppendExemplars should block until the samples are fully accepted,
        // whether enqueued in memory or successfully written to their final destination.
        // Once returned, the WAL Watcher will not attempt to pass that data again.
        Append([]record.RefSample) bool
        AppendExemplars([]record.RefExemplar) bool
        AppendHistograms([]record.RefHistogramSample) bool
        AppendFloatHistograms([]record.RefFloatHistogramSample) bool
        StoreSeries([]record.RefSeries, int)

        // The next two methods are intended for garbage collection: first we call
        // UpdateSeriesSegment on all current series.
        UpdateSeriesSegment([]record.RefSeries, int)
        // Then SeriesReset is called to allow the deletion
        // of all series created in a segment lower than the argument.
        SeriesReset(int)
}

// WriteNotified is implemented by types that want to be told when data has
// been written, so that they can read it. Watcher implements it.
type WriteNotified interface {
        // Notify signals that new data has been written.
        Notify()
}

// WatcherMetrics holds the metric vectors shared by all Watchers. Every vector
// is labelled by consumer (the watcher name); recordsRead is additionally
// labelled by record type. Each Watcher derives its own handles from these
// vectors in setMetrics.
type WatcherMetrics struct {
        recordsRead           *prometheus.CounterVec
        recordDecodeFails     *prometheus.CounterVec
        samplesSentPreTailing *prometheus.CounterVec
        currentSegment        *prometheus.GaugeVec
        notificationsSkipped  *prometheus.CounterVec
}

// Watcher watches the TSDB WAL for a given WriteTo.
//
// name identifies the watcher in log lines and as the consumer label of its
// metrics. walDir is the "wal" subdirectory of the directory passed to
// NewWatcher. lastCheckpoint is the checkpoint found by the latest Run.
// NOTE(review): sendExemplars and sendHistograms presumably gate forwarding of
// those record types; their use lies outside this part of the file.
type Watcher struct {
        name           string
        writer         WriteTo
        logger         log.Logger
        walDir         string
        lastCheckpoint string
        sendExemplars  bool
        sendHistograms bool
        metrics        *WatcherMetrics
        readerMetrics  *LiveReaderMetrics

        // sendSamples is reset to false at the start of every Run.
        startTime      time.Time
        startTimestamp int64 // the start time as a Prometheus timestamp
        sendSamples    bool

        // Per-watcher metric handles, derived from metrics in setMetrics. They
        // stay nil when metrics is nil.
        recordsReadMetric       *prometheus.CounterVec
        recordDecodeFailsMetric prometheus.Counter
        samplesSentPreTailing   prometheus.Counter
        currentSegmentMetric    prometheus.Gauge
        notificationsSkipped    prometheus.Counter

        // readNotify receives a value from Notify when data has been written.
        // quit is closed by Stop; done is closed when the loop goroutine exits.
        readNotify chan struct{}
        quit       chan struct{}
        done       chan struct{}

        // For testing, stop when we hit this segment.
        MaxSegment int
}

// NewWatcherMetrics creates the metric vectors shared by all WAL watchers.
// When reg is non-nil, every vector is registered with it (panicking on
// registration failure, as MustRegister does); a nil reg yields unregistered
// metrics.
func NewWatcherMetrics(reg prometheus.Registerer) *WatcherMetrics {
        const (
                namespace = "prometheus"
                subsystem = "wal_watcher"
        )

        recordsRead := prometheus.NewCounterVec(prometheus.CounterOpts{
                Namespace: namespace,
                Subsystem: subsystem,
                Name:      "records_read_total",
                Help:      "Number of records read by the WAL watcher from the WAL.",
        }, []string{consumer, "type"})

        recordDecodeFails := prometheus.NewCounterVec(prometheus.CounterOpts{
                Namespace: namespace,
                Subsystem: subsystem,
                Name:      "record_decode_failures_total",
                Help:      "Number of records read by the WAL watcher that resulted in an error when decoding.",
        }, []string{consumer})

        samplesSentPreTailing := prometheus.NewCounterVec(prometheus.CounterOpts{
                Namespace: namespace,
                Subsystem: subsystem,
                Name:      "samples_sent_pre_tailing_total",
                Help:      "Number of sample records read by the WAL watcher and sent to remote write during replay of existing WAL.",
        }, []string{consumer})

        currentSegment := prometheus.NewGaugeVec(prometheus.GaugeOpts{
                Namespace: namespace,
                Subsystem: subsystem,
                Name:      "current_segment",
                Help:      "Current segment the WAL watcher is reading records from.",
        }, []string{consumer})

        notificationsSkipped := prometheus.NewCounterVec(prometheus.CounterOpts{
                Namespace: namespace,
                Subsystem: subsystem,
                Name:      "notifications_skipped_total",
                Help:      "The number of WAL write notifications that the Watcher has skipped due to already being in a WAL read routine.",
        }, []string{consumer})

        if reg != nil {
                // Register one collector at a time, in declaration order.
                for _, c := range []prometheus.Collector{
                        recordsRead,
                        recordDecodeFails,
                        samplesSentPreTailing,
                        currentSegment,
                        notificationsSkipped,
                } {
                        reg.MustRegister(c)
                }
        }

        return &WatcherMetrics{
                recordsRead:           recordsRead,
                recordDecodeFails:     recordDecodeFails,
                samplesSentPreTailing: samplesSentPreTailing,
                currentSegment:        currentSegment,
                notificationsSkipped:  notificationsSkipped,
        }
}

// NewWatcher creates a new WAL watcher named name that forwards what it reads
// to writer. The WAL is expected in the "wal" subdirectory of dir. A nil logger
// is replaced by a no-op logger. The watcher does nothing until Start is
// called.
func NewWatcher(metrics *WatcherMetrics, readerMetrics *LiveReaderMetrics, logger log.Logger, name string, writer WriteTo, dir string, sendExemplars, sendHistograms bool) *Watcher {
        if logger == nil {
                logger = log.NewNopLogger()
        }

        w := &Watcher{
                name:           name,
                writer:         writer,
                logger:         logger,
                walDir:         filepath.Join(dir, "wal"),
                sendExemplars:  sendExemplars,
                sendHistograms: sendHistograms,
                metrics:        metrics,
                readerMetrics:  readerMetrics,

                readNotify: make(chan struct{}),
                quit:       make(chan struct{}),
                done:       make(chan struct{}),

                // -1 never matches a segment index, so the test-only stop is disabled.
                MaxSegment: -1,
        }
        return w
}

// Notify tells the watcher that data has been written to the WAL. It never
// blocks: if the watcher is not currently waiting for a notification, the
// notification is dropped and counted as skipped.
func (w *Watcher) Notify() {
        select {
        case w.readNotify <- struct{}{}:
                // Delivered; nothing more to do.
        default:
                // No buffered channel or other buffering is needed, because for
                // each notification it receives the watcher reads until EOF.
                w.notificationsSkipped.Inc()
        }
}

// setMetrics derives this watcher's metric handles from the shared vectors,
// using the watcher name as the consumer label. It is a no-op when no metrics
// were supplied.
//
// This happens here rather than in the constructor because of the ordering of
// creating Queue Managers, stopping them, and then starting new ones in
// storage/remote/storage.go ApplyConfig.
func (w *Watcher) setMetrics() {
        m := w.metrics
        if m == nil {
                return
        }

        w.recordsReadMetric = m.recordsRead.MustCurryWith(prometheus.Labels{consumer: w.name})
        w.recordDecodeFailsMetric = m.recordDecodeFails.WithLabelValues(w.name)
        w.samplesSentPreTailing = m.samplesSentPreTailing.WithLabelValues(w.name)
        w.currentSegmentMetric = m.currentSegment.WithLabelValues(w.name)
        w.notificationsSkipped = m.notificationsSkipped.WithLabelValues(w.name)
}

// Start sets up the watcher's metrics and launches its loop in a new
// goroutine. It returns immediately; call Stop to shut the goroutine down.
func (w *Watcher) Start() {
        w.setMetrics()
        level.Info(w.logger).Log("msg", "Starting WAL watcher", "queue", w.name)

        go w.loop()
}

// Stop signals the watcher goroutine to exit, waits for it to finish, and then
// removes this watcher's series from the shared metric vectors.
//
// Stop must be called at most once, and only after Start: it closes the quit
// channel and then blocks until the loop goroutine closes done.
func (w *Watcher) Stop() {
        close(w.quit)
        <-w.done

        if w.metrics != nil {
                // Only the "series" and "samples" series of recordsRead are removed here.
                w.metrics.recordsRead.DeleteLabelValues(w.name, "series")
                w.metrics.recordsRead.DeleteLabelValues(w.name, "samples")
                w.metrics.recordDecodeFails.DeleteLabelValues(w.name)
                w.metrics.samplesSentPreTailing.DeleteLabelValues(w.name)
                w.metrics.currentSegment.DeleteLabelValues(w.name)
                // setMetrics also creates a notificationsSkipped series for this
                // watcher; remove it too so stopped watchers leave nothing behind.
                w.metrics.notificationsSkipped.DeleteLabelValues(w.name)
        }

        level.Info(w.logger).Log("msg", "WAL watcher stopped", "queue", w.name)
}

// loop runs the watcher until quit is closed, and closes done on exit. A
// failing Run is logged and retried after a five second pause, since
// processing the WAL may fail transiently.
func (w *Watcher) loop() {
        defer close(w.done)

        for {
                if isClosed(w.quit) {
                        return
                }

                w.SetStartTime(time.Now())
                err := w.Run()
                if err != nil {
                        level.Error(w.logger).Log("msg", "error tailing WAL", "err", err)
                }

                // Wait before retrying, unless asked to quit in the meantime.
                select {
                case <-w.quit:
                        return
                case <-time.After(5 * time.Second):
                }
        }
}

// Run the watcher, which will tail the WAL until the quit channel is closed
// or an error case is hit. It first replays the last checkpoint, if there is
// one, and then processes segments in order, starting at the first segment
// whose index is greater than or equal to the checkpoint index. It returns nil
// when quit is closed or, in tests, once segment MaxSegment has been processed.
func (w *Watcher) Run() error {
        // We want to ensure this is false across iterations since
        // Run will be called again if there was a failure to read the WAL.
        w.sendSamples = false

        level.Info(w.logger).Log("msg", "Replaying WAL", "queue", w.name)

        // Backfill from the checkpoint first if it exists. A missing checkpoint
        // (record.ErrNotFound) is not an error.
        lastCheckpoint, checkpointIndex, err := LastCheckpoint(w.walDir)
        if err != nil && !errors.Is(err, record.ErrNotFound) {
                return fmt.Errorf("tsdb.LastCheckpoint: %w", err)
        }

        // err is nil only when a checkpoint was found.
        if err == nil {
                if err = w.readCheckpoint(lastCheckpoint, (*Watcher).readSegment); err != nil {
                        return fmt.Errorf("readCheckpoint: %w", err)
                }
        }
        w.lastCheckpoint = lastCheckpoint

        currentSegment, err := w.findSegmentForIndex(checkpointIndex)
        if err != nil {
                return err
        }

        level.Debug(w.logger).Log("msg", "Tailing WAL", "lastCheckpoint", lastCheckpoint, "checkpointIndex", checkpointIndex, "currentSegment", currentSegment)
        for !isClosed(w.quit) {
                w.currentSegmentMetric.Set(float64(currentSegment))

                // Re-check on each iteration in case a new segment was added,
                // because watch() will wait for notifications on the last segment.
                _, lastSegment, err := w.firstAndLast()
                if err != nil {
                        return fmt.Errorf("wal.Segments: %w", err)
                }
                // Only the newest segment is tailed; older ones are just replayed.
                tail := currentSegment >= lastSegment

                level.Debug(w.logger).Log("msg", "Processing segment", "currentSegment", currentSegment, "lastSegment", lastSegment)
                // Errors marked as ErrIgnorable do not abort the run.
                if err := w.watch(currentSegment, tail); err != nil && !errors.Is(err, ErrIgnorable) {
                        return err
                }

                // For testing: stop when you hit a specific segment.
                if currentSegment == w.MaxSegment {
                        return nil
                }

                currentSegment++
        }

        return nil
}

// findSegmentForIndex returns the first segment number reported by w.segments
// for the WAL directory that is greater than or equal to index. It returns -1
// and an error when the segments cannot be listed or no such segment exists.
func (w *Watcher) findSegmentForIndex(index int) (int, error) {
        segs, err := w.segments(w.walDir)
        if err != nil {
                return -1, err
        }

        // Skip everything below index; the first remaining entry is the answer.
        for i := range segs {
                if segs[i] < index {
                        continue
                }
                return segs[i], nil
        }

        return -1, errors.New("failed to find segment for index")
}

// firstAndLast returns the first and last segment numbers reported by
// w.segments for the WAL directory. Both values are -1 when listing fails
// (with the error) or when there are no segments (with a nil error).
func (w *Watcher) firstAndLast() (int, int, error) {
        segs, err := w.segments(w.walDir)
        switch {
        case err != nil:
                return -1, -1, err
        case len(segs) == 0:
                return -1, -1, nil
        }

        first, last := segs[0], segs[len(segs)-1]
        return first, last, nil
}

// segments lists the entries of dir whose names parse as integers and returns
// those integers sorted in ascending order. Entries with non-numeric names are
// skipped. An error is returned when dir cannot be read or when the numbers
// found are not consecutive.
//
// Copied from tsdb/wlog/wlog.go so we do not have to open a WAL.
// Plan is to move WAL watcher to TSDB and dedupe these implementations.
func (w *Watcher) segments(dir string) ([]int, error) {
        entries, err := os.ReadDir(dir)
        if err != nil {
                return nil, err
        }

        var refs []int
        for _, entry := range entries {
                if n, convErr := strconv.Atoi(entry.Name()); convErr == nil {
                        refs = append(refs, n)
                }
        }

        slices.Sort(refs)
        // Every element must be exactly one above its predecessor.
        for i := 1; i < len(refs); i++ {
                if refs[i] != refs[i-1]+1 {
                        return nil, errors.New("segments are not sequential")
                }
        }
        return refs, nil
}

// readAndHandleError reads the segment via readSegment and maps the outcome
// depending on the mode:
//   - when tailing, a non-EOF error is returned as is and anything else yields nil;
//   - when replaying (tail is false), errors are only logged, a warning is also
//     logged if the reader offset differs from size, and ErrIgnorable is
//     always returned.
func (w *Watcher) readAndHandleError(r *LiveReader, segmentNum int, tail bool, size int64) error {
        err := w.readSegment(r, segmentNum, tail)
        failed := err != nil && !errors.Is(err, io.EOF)

        // When we are tailing, non-EOFs are fatal.
        if tail {
                if failed {
                        return err
                }
                return nil
        }

        // Ignore all errors reading to end of segment whilst replaying the WAL.
        switch {
        case failed:
                level.Warn(w.logger).Log("msg", "Ignoring error reading to end of segment, may have dropped data", "segment", segmentNum, "err", err)
        case r.Offset() != size:
                level.Warn(w.logger).Log("msg", "Expected to have read whole segment, may have dropped data", "segment", segmentNum, "read", r.Offset(), "size", size)
        }
        return ErrIgnorable
}

// watch reads the given segment and forwards its records through readSegment.
//
// Use tail true to indicate that the reader is currently on a segment that is
// actively being written to. If false, assume it's a full segment and we're
// replaying it on start to cache the series records.
//
// With tail false the segment is read once and the result of
// readAndHandleError is returned. With tail true, watch loops until w.quit is
// closed, a newer segment appears, or a read fails, reading whenever
// w.readNotify fires or readTimeout elapses, and periodically starting a
// checkpoint-based garbage collection of series in the background.
func (w *Watcher) watch(segmentNum int, tail bool) error {
        segment, err := OpenReadSegment(SegmentName(w.walDir, segmentNum))
        if err != nil {
                return err
        }
        defer segment.Close()

        reader := NewLiveReader(w.logger, w.readerMetrics, segment)

        size := int64(math.MaxInt64)
        if !tail {
                var err error
                size, err = getSegmentSize(w.walDir, segmentNum)
                if err != nil {
                        return fmt.Errorf("getSegmentSize: %w", err)
                }

                return w.readAndHandleError(reader, segmentNum, tail, size)
        }

        // From here on tail is always true.

        checkpointTicker := time.NewTicker(checkpointPeriod)
        defer checkpointTicker.Stop()

        segmentTicker := time.NewTicker(segmentCheckPeriod)
        defer segmentTicker.Stop()

        readTicker := time.NewTicker(readTimeout)
        defer readTicker.Stop()

        // gcSem allows at most one garbage collection goroutine at a time.
        gcSem := make(chan struct{}, 1)
        for {
                select {
                case <-w.quit:
                        return nil

                case <-checkpointTicker.C:
                        // Periodically check if there is a new checkpoint so we can garbage
                        // collect labels. As this is considered an optimisation, we ignore
                        // errors during checkpoint processing. Doing the process asynchronously
                        // allows the current WAL segment to be processed while reading the
                        // checkpoint.
                        select {
                        case gcSem <- struct{}{}:
                                go func() {
                                        defer func() {
                                                <-gcSem
                                        }()
                                        if err := w.garbageCollectSeries(segmentNum); err != nil {
                                                level.Warn(w.logger).Log("msg", "Error process checkpoint", "err", err)
                                        }
                                }()
                        default:
                                // Currently doing a garbage collect, try again later.
                        }

                case <-segmentTicker.C:
                        _, last, err := w.firstAndLast()
                        if err != nil {
                                return fmt.Errorf("segments: %w", err)
                        }

                        // Check if new segments exists.
                        if last <= segmentNum {
                                continue
                        }

                        // A newer segment exists: read what is left of this one and return so
                        // the caller can move on. Since we are tailing here,
                        // readAndHandleError returns non-EOF errors as fatal and nil otherwise.
                        return w.readAndHandleError(reader, segmentNum, tail, size)

                // we haven't read due to a notification in quite some time, try reading anyways
                case <-readTicker.C:
                        level.Debug(w.logger).Log("msg", "Watcher is reading the WAL due to timeout, haven't received any write notifications recently", "timeout", readTimeout)
                        err := w.readAndHandleError(reader, segmentNum, tail, size)
                        if err != nil {
                                return err
                        }
                        // still want to reset the ticker so we don't read too often
                        readTicker.Reset(readTimeout)

                case <-w.readNotify:
                        err := w.readAndHandleError(reader, segmentNum, tail, size)
                        if err != nil {
                                return err
                        }
                        // still want to reset the ticker so we don't read too often
                        readTicker.Reset(readTimeout)
                }
        }
}

// garbageCollectSeries looks up the latest checkpoint and, if it is one that
// has not been seen before and its index is below segmentNum, re-reads it with
// readSegmentForGC and then asks the writer to reset series via SeriesReset
// with the checkpoint index.
func (w *Watcher) garbageCollectSeries(segmentNum int) error {
        checkpointDir, _, err := LastCheckpoint(w.walDir)
        if err != nil && !errors.Is(err, record.ErrNotFound) {
                return fmt.Errorf("tsdb.LastCheckpoint: %w", err)
        }

        // Nothing to do without a checkpoint, or when it was already handled.
        if checkpointDir == "" || checkpointDir == w.lastCheckpoint {
                return nil
        }
        w.lastCheckpoint = checkpointDir

        checkpointIndex, err := checkpointNum(checkpointDir)
        if err != nil {
                return fmt.Errorf("error parsing checkpoint filename: %w", err)
        }

        if checkpointIndex >= segmentNum {
                level.Debug(w.logger).Log("msg", "Current segment is behind the checkpoint, skipping reading of checkpoint", "current", fmt.Sprintf("%08d", segmentNum), "checkpoint", checkpointDir)
                return nil
        }

        level.Debug(w.logger).Log("msg", "New checkpoint detected", "new", checkpointDir, "currentSegment", segmentNum)

        if readErr := w.readCheckpoint(checkpointDir, (*Watcher).readSegmentForGC); readErr != nil {
                return fmt.Errorf("readCheckpoint: %w", readErr)
        }

        // Clear series with a checkpoint or segment index # lower than the checkpoint we just read.
        w.writer.SeriesReset(checkpointIndex)
        return nil
}

// Read from a segment and pass the details to w.writer.
// Also used with readCheckpoint - implements segmentReadFn.
//
// Records are decoded one at a time until r.Next() returns false or w.quit is
// closed. Series records are always passed to w.writer.StoreSeries together
// with segmentNum. Samples, exemplars and (float) histograms are only
// processed when tail is true; exemplars and histograms additionally require
// w.sendExemplars / w.sendHistograms. Samples and histograms are forwarded
// only if their timestamp is after w.startTimestamp.
//
// A decoding error is returned immediately after incrementing
// w.recordDecodeFailsMetric. A reader error is returned wrapped with the
// segment number.
func (w *Watcher) readSegment(r *LiveReader, segmentNum int, tail bool) error {
        // NOTE(review): the decode calls below use `:=`, which declares new
        // variables scoped to each case clause. The slices declared here are
        // therefore never reassigned, so their capacity is not carried over
        // between records — confirm whether buffer reuse was intended.
        var (
                dec                   = record.NewDecoder(labels.NewSymbolTable()) // One table per WAL segment means it won't grow indefinitely.
                series                []record.RefSeries
                samples               []record.RefSample
                samplesToSend         []record.RefSample
                exemplars             []record.RefExemplar
                histograms            []record.RefHistogramSample
                histogramsToSend      []record.RefHistogramSample
                floatHistograms       []record.RefFloatHistogramSample
                floatHistogramsToSend []record.RefFloatHistogramSample
        )
        for r.Next() && !isClosed(w.quit) {
                rec := r.Record()
                // Count every record read, labelled by its decoded type.
                w.recordsReadMetric.WithLabelValues(dec.Type(rec).String()).Inc()

                switch dec.Type(rec) {
                case record.Series:
                        series, err := dec.Series(rec, series[:0])
                        if err != nil {
                                w.recordDecodeFailsMetric.Inc()
                                return err
                        }
                        w.writer.StoreSeries(series, segmentNum)

                case record.Samples:
                        // If we're not tailing a segment we can ignore any samples records we see.
                        // This speeds up replay of the WAL by > 10x.
                        if !tail {
                                break
                        }
                        samples, err := dec.Samples(rec, samples[:0])
                        if err != nil {
                                w.recordDecodeFailsMetric.Inc()
                                return err
                        }
                        for _, s := range samples {
                                if s.T > w.startTimestamp {
                                        // The first sample newer than the start timestamp marks the
                                        // end of the replay phase; log it once.
                                        if !w.sendSamples {
                                                w.sendSamples = true
                                                duration := time.Since(w.startTime)
                                                level.Info(w.logger).Log("msg", "Done replaying WAL", "duration", duration)
                                        }
                                        samplesToSend = append(samplesToSend, s)
                                }
                        }
                        if len(samplesToSend) > 0 {
                                w.writer.Append(samplesToSend)
                                // Keep the backing array for the next record.
                                samplesToSend = samplesToSend[:0]
                        }

                case record.Exemplars:
                        // Skip if experimental "exemplars over remote write" is not enabled.
                        if !w.sendExemplars {
                                break
                        }
                        // If we're not tailing a segment we can ignore any exemplars records we see.
                        // This speeds up replay of the WAL significantly.
                        if !tail {
                                break
                        }
                        exemplars, err := dec.Exemplars(rec, exemplars[:0])
                        if err != nil {
                                w.recordDecodeFailsMetric.Inc()
                                return err
                        }
                        w.writer.AppendExemplars(exemplars)

                case record.HistogramSamples:
                        // Skip if experimental "histograms over remote write" is not enabled.
                        if !w.sendHistograms {
                                break
                        }
                        // As with samples, histograms are only forwarded while tailing.
                        if !tail {
                                break
                        }
                        histograms, err := dec.HistogramSamples(rec, histograms[:0])
                        if err != nil {
                                w.recordDecodeFailsMetric.Inc()
                                return err
                        }
                        for _, h := range histograms {
                                if h.T > w.startTimestamp {
                                        if !w.sendSamples {
                                                w.sendSamples = true
                                                duration := time.Since(w.startTime)
                                                level.Info(w.logger).Log("msg", "Done replaying WAL", "duration", duration)
                                        }
                                        histogramsToSend = append(histogramsToSend, h)
                                }
                        }
                        if len(histogramsToSend) > 0 {
                                w.writer.AppendHistograms(histogramsToSend)
                                histogramsToSend = histogramsToSend[:0]
                        }
                case record.FloatHistogramSamples:
                        // Skip if experimental "histograms over remote write" is not enabled.
                        if !w.sendHistograms {
                                break
                        }
                        // As with samples, float histograms are only forwarded while tailing.
                        if !tail {
                                break
                        }
                        floatHistograms, err := dec.FloatHistogramSamples(rec, floatHistograms[:0])
                        if err != nil {
                                w.recordDecodeFailsMetric.Inc()
                                return err
                        }
                        for _, fh := range floatHistograms {
                                if fh.T > w.startTimestamp {
                                        if !w.sendSamples {
                                                w.sendSamples = true
                                                duration := time.Since(w.startTime)
                                                level.Info(w.logger).Log("msg", "Done replaying WAL", "duration", duration)
                                        }
                                        floatHistogramsToSend = append(floatHistogramsToSend, fh)
                                }
                        }
                        if len(floatHistogramsToSend) > 0 {
                                w.writer.AppendFloatHistograms(floatHistogramsToSend)
                                floatHistogramsToSend = floatHistogramsToSend[:0]
                        }
                case record.Tombstones:
                        // Tombstone records are recognised but deliberately not processed.

                default:
                        // Could be corruption, or reading from a WAL from a newer Prometheus.
                        w.recordDecodeFailsMetric.Inc()
                }
        }
        if err := r.Err(); err != nil {
                return fmt.Errorf("segment %d: %w", segmentNum, err)
        }
        return nil
}

// readSegmentForGC goes through all series in a segment updating the
// segmentNum, so we can delete older series.
// Used with readCheckpoint - implements segmentReadFn; the tail argument is
// ignored.
//
// Only series records are acted upon (via w.writer.UpdateSeriesSegment). All
// other record types that readSegment knows how to handle are skipped without
// being counted as decode failures; only types outside that set increment
// w.recordDecodeFailsMetric. A series decoding error is returned as is, a
// reader error is returned wrapped with the segment number.
func (w *Watcher) readSegmentForGC(r *LiveReader, segmentNum int, _ bool) error {
        var (
                dec    = record.NewDecoder(labels.NewSymbolTable()) // Needed for decoding; labels do not outlive this function.
                series []record.RefSeries
        )
        for r.Next() && !isClosed(w.quit) {
                rec := r.Record()
                w.recordsReadMetric.WithLabelValues(dec.Type(rec).String()).Inc()

                switch dec.Type(rec) {
                case record.Series:
                        series, err := dec.Series(rec, series[:0])
                        if err != nil {
                                w.recordDecodeFailsMetric.Inc()
                                return err
                        }
                        w.writer.UpdateSeriesSegment(series, segmentNum)

                // Ignore these; we're only interested in series. Histogram records are
                // valid record types too and must not be reported as decode failures.
                case record.Samples,
                        record.Exemplars,
                        record.Tombstones,
                        record.HistogramSamples,
                        record.FloatHistogramSamples:

                default:
                        // Could be corruption, or reading from a WAL from a newer Prometheus.
                        w.recordDecodeFailsMetric.Inc()
                }
        }
        if err := r.Err(); err != nil {
                return fmt.Errorf("segment %d: %w", segmentNum, err)
        }
        return nil
}

// SetStartTime stores t as the watcher's start time, both as a time.Time and
// as the value returned by timestamp.FromTime(t).
func (w *Watcher) SetStartTime(t time.Time) {
        w.startTimestamp = timestamp.FromTime(t)
        w.startTime = t
}

// segmentReadFn is the signature of the functions readCheckpoint uses to
// process one segment; readSegment and readSegmentForGC are passed as method
// expressions of this type.
type segmentReadFn func(w *Watcher, r *LiveReader, segmentNum int, tail bool) error

// readCheckpoint reads all the records from a checkpoint directory, passing
// every segment found in it to readFn with the checkpoint's index as the
// segment number and tail set to false.
//
// It returns an error if the checkpoint index cannot be parsed from
// checkpointDir, if the segments cannot be listed, sized or opened, if readFn
// fails with anything other than io.EOF, or if a segment was not read to its
// full size.
func (w *Watcher) readCheckpoint(checkpointDir string, readFn segmentReadFn) error {
        level.Debug(w.logger).Log("msg", "Reading checkpoint", "dir", checkpointDir)
        index, err := checkpointNum(checkpointDir)
        if err != nil {
                return fmt.Errorf("checkpointNum: %w", err)
        }

        // Ensure we read the whole contents of every segment in the checkpoint dir.
        segs, err := w.segments(checkpointDir)
        if err != nil {
                return fmt.Errorf("Unable to get segments checkpoint dir: %w", err)
        }

        // readOne processes a single segment. It is a separate function so that
        // the deferred Close runs as soon as that segment is done, instead of
        // keeping every segment file open until readCheckpoint returns.
        readOne := func(seg int) error {
                size, err := getSegmentSize(checkpointDir, seg)
                if err != nil {
                        return fmt.Errorf("getSegmentSize: %w", err)
                }

                sr, err := OpenReadSegment(SegmentName(checkpointDir, seg))
                if err != nil {
                        return fmt.Errorf("unable to open segment: %w", err)
                }
                defer sr.Close()

                r := NewLiveReader(w.logger, w.readerMetrics, sr)
                if err := readFn(w, r, index, false); err != nil && !errors.Is(err, io.EOF) {
                        return fmt.Errorf("readSegment: %w", err)
                }

                if r.Offset() != size {
                        return fmt.Errorf("readCheckpoint wasn't able to read all data from the checkpoint %s/%08d, size: %d, totalRead: %d", checkpointDir, seg, size, r.Offset())
                }
                return nil
        }

        for _, seg := range segs {
                if err := readOne(seg); err != nil {
                        return err
                }
        }

        level.Debug(w.logger).Log("msg", "Read series references from checkpoint", "checkpoint", checkpointDir)
        return nil
}

// checkpointNum extracts the numeric suffix from a checkpoint directory name.
//
// Checkpoint dir names are in the format checkpoint.000001. dir may contain
// a hidden directory, so only the base name of the path is inspected. The base
// name must contain exactly one "." followed by an integer; otherwise an
// error naming dir is returned.
func checkpointNum(dir string) (int, error) {
        // A suffix containing a further "." can never parse as an integer, so
        // names with more than one dot are rejected by Atoi below.
        _, suffix, found := strings.Cut(filepath.Base(dir), ".")
        if found {
                if n, err := strconv.Atoi(suffix); err == nil {
                        return n, nil
                }
        }
        return 0, fmt.Errorf("invalid checkpoint dir string: %s", dir)
}

// getSegmentSize returns the size in bytes of the segment file with the given
// index in dir. If the file cannot be stat'ed it returns -1 and the error.
func getSegmentSize(dir string, index int) (int64, error) {
        info, err := os.Stat(SegmentName(dir, index))
        if err != nil {
                return -1, err
        }
        return info.Size(), nil
}

// isClosed reports whether a receive from c succeeds without blocking, which
// is the case once c has been closed. It never blocks.
func isClosed(c chan struct{}) bool {
        closed := false
        select {
        case <-c:
                closed = true
        default:
                // Receiving would block, so the channel is still open.
        }
        return closed
}

// Copyright 2017 The Prometheus Authors

// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package wlog

import (
        "bufio"
        "encoding/binary"
        "errors"
        "fmt"
        "hash/crc32"
        "io"
        "os"
        "path/filepath"
        "slices"
        "strconv"
        "sync"
        "time"

        "github.com/go-kit/log"
        "github.com/go-kit/log/level"
        "github.com/golang/snappy"
        "github.com/klauspost/compress/zstd"
        "github.com/prometheus/client_golang/prometheus"

        "github.com/prometheus/prometheus/tsdb/fileutil"
)

const (
        // DefaultSegmentSize is the segment size New passes to NewSize.
        DefaultSegmentSize = 128 * 1024 * 1024 // 128 MB
        // pageSize is the size of a page buffer. Segment sizes given to NewSize
        // must be a multiple of it.
        pageSize           = 32 * 1024         // 32KB
        // recordHeaderSize is the number of bytes a record header occupies; a
        // page with fewer free bytes than this is considered full.
        recordHeaderSize   = 7
        // WblDirName is the directory base name for which NewSize selects the
        // out-of-order WBL metric prefix instead of the WAL one.
        WblDirName         = "wbl"
)

// castagnoliTable is the CRC-32 table for the Castagnoli polynomial.
//
// The table gets initialized with sync.Once but may still cause a race
// with any other use of the crc32 package anywhere. Thus we initialize it
// up front, when the package is initialized.
var castagnoliTable = crc32.MakeTable(crc32.Castagnoli)

// page is an in-memory buffer used to batch disk writes.
// Records bigger than the page size are split and flushed separately.
// A flush is triggered when a single record doesn't fit the page size or
// when the next record can't fit in the remaining free page space.
type page struct {
        alloc   int
        flushed int
        buf     [pageSize]byte
}

// remaining returns the number of bytes of the page that are not allocated yet.
func (p *page) remaining() int {
        return len(p.buf) - p.alloc
}

// full reports whether the free space left in the page is too small to hold
// even a record header.
func (p *page) full() bool {
        return p.remaining() < recordHeaderSize
}

// reset zeroes the buffer and both counters so the page can be reused.
func (p *page) reset() {
        *p = page{}
}

// SegmentFile represents the underlying file used to store a segment.
// It is the subset of file operations a Segment relies on: stat, sync,
// reading, writing and closing. The *os.File values opened by
// OpenWriteSegment, CreateSegment and OpenReadSegment are used as SegmentFile.
type SegmentFile interface {
        Stat() (os.FileInfo, error)
        Sync() error
        io.Writer
        io.Reader
        io.Closer
}

// Segment represents a segment file.
// It embeds the underlying SegmentFile and remembers the directory and the
// index the segment was opened or created with.
type Segment struct {
        SegmentFile
        // dir is the directory of the segment, as returned by Dir.
        dir string
        // i is the index of the segment, as returned by Index.
        i   int
}

// Index returns the index of the segment.
func (s *Segment) Index() int {
        return s.i
}

// Dir returns the directory of the segment.
func (s *Segment) Dir() string {
        return s.dir
}

// CorruptionErr is an error that's returned when corruption is encountered.
// Dir and Segment identify the affected segment file, Offset is the position
// reported in the message and Err is the underlying cause. A negative Segment
// means the segment is not known.
type CorruptionErr struct {
        Dir     string
        Segment int
        Offset  int64
        Err     error
}

// Error implements the error interface. The message names the segment file
// when Segment is non-negative and only the offset otherwise.
func (e *CorruptionErr) Error() string {
        if e.Segment >= 0 {
                segment := SegmentName(e.Dir, e.Segment)
                return fmt.Sprintf("corruption in segment %s at %d: %s", segment, e.Offset, e.Err)
        }
        return fmt.Sprintf("corruption after %d bytes: %s", e.Offset, e.Err)
}

// Unwrap returns the underlying cause so that errors.Is and errors.As can see it.
func (e *CorruptionErr) Unwrap() error {
        cause := e.Err
        return cause
}

// OpenWriteSegment opens segment k in dir. The returned segment is ready for new appends.
// The file must already exist; it is opened write-only in append mode. If its
// size is not a multiple of the page size, the last page is padded with zeros
// and a warning is logged to logger.
func OpenWriteSegment(logger log.Logger, dir string, k int) (*Segment, error) {
        segPath := SegmentName(dir, k)
        file, err := os.OpenFile(segPath, os.O_WRONLY|os.O_APPEND, 0o666)
        if err != nil {
                return nil, err
        }

        info, err := file.Stat()
        if err != nil {
                file.Close()
                return nil, err
        }

        // If the last page is torn, fill it with zeros.
        // In case it was torn after all records were written successfully, this
        // will just pad the page and everything will be fine.
        // If it was torn mid-record, a full read (which the caller should do anyway
        // to ensure integrity) will detect it as a corruption by the end.
        if tornBytes := info.Size() % pageSize; tornBytes != 0 {
                level.Warn(logger).Log("msg", "Last page of the wlog is torn, filling it with zeros", "segment", segPath)
                padding := make([]byte, pageSize-tornBytes)
                if _, err := file.Write(padding); err != nil {
                        file.Close()
                        return nil, fmt.Errorf("zero-pad torn page: %w", err)
                }
        }

        seg := &Segment{SegmentFile: file, i: k, dir: dir}
        return seg, nil
}

// CreateSegment creates a new segment k in dir.
// The file is opened write-only in append mode and created if it does not
// exist yet; the error from opening it is returned unchanged.
func CreateSegment(dir string, k int) (*Segment, error) {
        const flags = os.O_WRONLY | os.O_CREATE | os.O_APPEND

        file, err := os.OpenFile(SegmentName(dir, k), flags, 0o666)
        if err != nil {
                return nil, err
        }

        seg := &Segment{SegmentFile: file, i: k, dir: dir}
        return seg, nil
}

// OpenReadSegment opens the segment with the given filename.
// The base name of fn must parse as an integer, which becomes the segment
// index; otherwise an error is returned without touching the file system. The
// segment's directory is the directory part of fn.
func OpenReadSegment(fn string) (*Segment, error) {
        index, convErr := strconv.Atoi(filepath.Base(fn))
        if convErr != nil {
                return nil, errors.New("not a valid filename")
        }

        file, err := os.Open(fn)
        if err != nil {
                return nil, err
        }

        seg := &Segment{SegmentFile: file, i: index, dir: filepath.Dir(fn)}
        return seg, nil
}

// CompressionType names the compression applied to records of a write log.
type CompressionType string

// The compression types known to the write log.
const (
        CompressionNone   CompressionType = "none"
        CompressionSnappy CompressionType = "snappy"
        CompressionZstd   CompressionType = "zstd"
)

// ParseCompressionType parses the two compression-related configuration values and returns the CompressionType. If
// compression is enabled but the compressType is unrecognized, we default to Snappy compression.
// With compression disabled the result is always CompressionNone.
func ParseCompressionType(compress bool, compressType string) CompressionType {
        switch {
        case !compress:
                return CompressionNone
        case compressType == string(CompressionZstd):
                return CompressionZstd
        default:
                return CompressionSnappy
        }
}

// WL is a write log that stores records in segment files.
// It must be read from start to end once before logging new data.
// If an error occurs during read, the repair procedure must be called
// before it's safe to do further writes.
//
// Segments are written to in pages of 32KB, with records possibly split
// across page boundaries.
// Records are never split across segments to allow full segments to be
// safely truncated. It also ensures that torn writes never corrupt records
// beyond the most recent segment.
type WL struct {
        // dir is the directory holding the segment files; see Dir.
        dir         string
        logger      log.Logger
        // segmentSize is the size given to NewSize, a multiple of pageSize.
        segmentSize int
        // NOTE(review): presumably guards the mutable state below — confirm
        // against the methods that lock it.
        mtx         sync.RWMutex
        segment     *Segment // Active segment.
        donePages   int      // Pages written to the segment.
        page        *page    // Active page.
        // stopc carries stop requests to run; the channel sent over it is closed
        // by run once it has finished.
        stopc       chan chan struct{}
        // actorc carries functions that run executes one at a time.
        actorc      chan func()
        closed      bool // To allow calling Close() more than once without blocking.
        // compress is the compression type reported by CompressionType.
        compress    CompressionType
        // NOTE(review): presumably a scratch buffer for compressed records —
        // its use is not visible here.
        compressBuf []byte
        // zstdWriter is created by NewSize when compress is CompressionZstd, and
        // unconditionally by Open.
        zstdWriter  *zstd.Encoder

        // WriteNotified is set through SetWriteNotified.
        WriteNotified WriteNotified

        // metrics is created by NewSize via newWLMetrics; Open leaves it nil.
        metrics *wlMetrics
}

// wlMetrics bundles the metrics of a write log together with the registerer
// they were registered with, so that they can be unregistered again.
type wlMetrics struct {
        fsyncDuration   prometheus.Summary   // Duration of write log fsync.
        pageFlushes     prometheus.Counter   // Total number of page flushes.
        pageCompletions prometheus.Counter   // Total number of completed pages.
        truncateFail    prometheus.Counter   // Total number of write log truncations that failed.
        truncateTotal   prometheus.Counter   // Total number of write log truncations attempted.
        currentSegment  prometheus.Gauge     // Write log segment index currently written to.
        writesFailed    prometheus.Counter   // Total number of write log writes that failed.
        walFileSize     prometheus.GaugeFunc // Size of the write log directory.

        // r is the registerer passed to newWLMetrics; may be nil.
        r prometheus.Registerer
}

// Unregister removes all metrics of w from the registerer they were created
// with. It is a no-op when no registerer was given.
func (w *wlMetrics) Unregister() {
        if w.r == nil {
                return
        }

        collectors := []prometheus.Collector{
                w.fsyncDuration,
                w.pageFlushes,
                w.pageCompletions,
                w.truncateFail,
                w.truncateTotal,
                w.currentSegment,
                w.writesFailed,
                w.walFileSize,
        }
        for _, c := range collectors {
                w.r.Unregister(c)
        }
}

// newWLMetrics creates the metrics for the write log w and, if r is not nil,
// registers all of them with r (panicking via MustRegister on failure). The
// returned wlMetrics remembers r so the metrics can be unregistered later.
func newWLMetrics(w *WL, r prometheus.Registerer) *wlMetrics {
        // dirSize backs the storage size gauge. A failure to compute the size is
        // logged and the value returned by w.Size() is reported regardless.
        dirSize := func() float64 {
                size, err := w.Size()
                if err != nil {
                        level.Error(w.logger).Log("msg", "Failed to calculate size of \"wal\" dir",
                                "err", err.Error())
                }
                return float64(size)
        }

        m := &wlMetrics{
                r: r,
                fsyncDuration: prometheus.NewSummary(prometheus.SummaryOpts{
                        Name:       "fsync_duration_seconds",
                        Help:       "Duration of write log fsync.",
                        Objectives: map[float64]float64{0.5: 0.05, 0.9: 0.01, 0.99: 0.001},
                }),
                pageFlushes: prometheus.NewCounter(prometheus.CounterOpts{
                        Name: "page_flushes_total",
                        Help: "Total number of page flushes.",
                }),
                pageCompletions: prometheus.NewCounter(prometheus.CounterOpts{
                        Name: "completed_pages_total",
                        Help: "Total number of completed pages.",
                }),
                truncateFail: prometheus.NewCounter(prometheus.CounterOpts{
                        Name: "truncations_failed_total",
                        Help: "Total number of write log truncations that failed.",
                }),
                truncateTotal: prometheus.NewCounter(prometheus.CounterOpts{
                        Name: "truncations_total",
                        Help: "Total number of write log truncations attempted.",
                }),
                currentSegment: prometheus.NewGauge(prometheus.GaugeOpts{
                        Name: "segment_current",
                        Help: "Write log segment index that TSDB is currently writing to.",
                }),
                writesFailed: prometheus.NewCounter(prometheus.CounterOpts{
                        Name: "writes_failed_total",
                        Help: "Total number of write log writes that failed.",
                }),
                walFileSize: prometheus.NewGaugeFunc(prometheus.GaugeOpts{
                        Name: "storage_size_bytes",
                        Help: "Size of the write log directory.",
                }, dirSize),
        }

        if r == nil {
                return m
        }

        r.MustRegister(
                m.fsyncDuration,
                m.pageFlushes,
                m.pageCompletions,
                m.truncateFail,
                m.truncateTotal,
                m.currentSegment,
                m.writesFailed,
                m.walFileSize,
        )
        return m
}

// New returns a new WAL over the given directory.
// It is NewSize called with DefaultSegmentSize as the segment size; logger,
// reg, dir and compress are passed through unchanged.
func New(logger log.Logger, reg prometheus.Registerer, dir string, compress CompressionType) (*WL, error) {
        return NewSize(logger, reg, dir, DefaultSegmentSize, compress)
}

// NewSize returns a new write log over the given directory.
// New segments are created with the specified size.
//
// segmentSize must be a multiple of the page size. dir is created if it does
// not exist. A nil logger is replaced by a no-op logger. Metrics are
// registered with reg under the "prometheus_tsdb_wal_" prefix, or under
// "prometheus_tsdb_out_of_order_wbl_" when the base name of dir is WblDirName.
// A fresh segment with an index one above the last existing segment (or 0 if
// there is none) is created and made the active segment, and the background
// goroutine executing queued functions is started.
//
// If anything fails after the metrics have been registered, they are
// unregistered again and a segment file that was already created is closed,
// so that a failed call leaves neither registrations nor open files behind.
func NewSize(logger log.Logger, reg prometheus.Registerer, dir string, segmentSize int, compress CompressionType) (*WL, error) {
        if segmentSize%pageSize != 0 {
                return nil, errors.New("invalid segment size")
        }
        if err := os.MkdirAll(dir, 0o777); err != nil {
                return nil, fmt.Errorf("create dir: %w", err)
        }
        if logger == nil {
                logger = log.NewNopLogger()
        }

        var zstdWriter *zstd.Encoder
        if compress == CompressionZstd {
                var err error
                zstdWriter, err = zstd.NewWriter(nil)
                if err != nil {
                        return nil, err
                }
        }

        w := &WL{
                dir:         dir,
                logger:      logger,
                segmentSize: segmentSize,
                page:        &page{},
                actorc:      make(chan func(), 100),
                stopc:       make(chan chan struct{}),
                compress:    compress,
                zstdWriter:  zstdWriter,
        }
        prefix := "prometheus_tsdb_wal_"
        if filepath.Base(dir) == WblDirName {
                prefix = "prometheus_tsdb_out_of_order_wbl_"
        }
        w.metrics = newWLMetrics(w, prometheus.WrapRegistererWithPrefix(prefix, reg))

        // The metrics are registered from here on. Every error path below must
        // unregister them, otherwise a later attempt with the same registerer
        // would run into MustRegister with already registered collectors.
        _, last, err := Segments(w.Dir())
        if err != nil {
                w.metrics.Unregister()
                return nil, fmt.Errorf("get segment range: %w", err)
        }

        // Index of the Segment we want to open and write to.
        writeSegmentIndex := 0
        // If some segments already exist create one with a higher index than the last segment.
        if last != -1 {
                writeSegmentIndex = last + 1
        }

        segment, err := CreateSegment(w.Dir(), writeSegmentIndex)
        if err != nil {
                w.metrics.Unregister()
                return nil, err
        }

        if err := w.setSegment(segment); err != nil {
                // The WL is discarded, so nobody else would close the file. The
                // Close error is ignored in favour of the original failure.
                segment.Close()
                w.metrics.Unregister()
                return nil, err
        }

        go w.run()

        return w, nil
}

// Open returns a WL handle for the WAL stored in dir. Only the directory, the
// logger and a zstd encoder are filled in; no segment is opened or created.
// A nil logger is replaced by a no-op logger.
func Open(logger log.Logger, dir string) (*WL, error) {
        enc, err := zstd.NewWriter(nil)
        if err != nil {
                return nil, err
        }
        if logger == nil {
                logger = log.NewNopLogger()
        }

        return &WL{
                dir:        dir,
                logger:     logger,
                zstdWriter: enc,
        }, nil
}

// CompressionType returns the compression type stored in this WL's compress
// field, i.e. the value the WL was constructed with.
func (w *WL) CompressionType() CompressionType {
        return w.compress
}

// Dir returns the directory of the WAL, as passed in when the WL was created.
func (w *WL) Dir() string {
        return w.dir
}

// SetWriteNotified stores wn in the WL's WriteNotified field, replacing any
// previously set value.
func (w *WL) SetWriteNotified(wn WriteNotified) {
        w.WriteNotified = wn
}

// run is the actor loop of the WL. It executes the functions received on
// actorc one at a time until a stop request arrives on stopc. On stop it
// closes actorc, runs whatever functions are still queued, and only then
// closes the channel received from stopc to signal completion.
func (w *WL) run() {
Loop:
        for {
                select {
                case f := <-w.actorc:
                        f()
                case donec := <-w.stopc:
                        // Closing actorc lets the range loop below terminate once the
                        // queue is empty.
                        close(w.actorc)
                        // Deferred so that donec is closed after the drain loop has
                        // finished, not here.
                        defer close(donec)
                        break Loop
                }
        }
        // Drain and process any remaining functions.
        for f := range w.actorc {
                f()
        }
}

// Repair attempts to repair the WAL based on the error.
// It discards all data after the corruption.
//
// origErr must wrap a *CorruptionErr with a non-negative Segment, otherwise an
// error is returned. All segments with a higher index than the corrupted one
// are deleted, the corrupted segment is rewritten with the records that
// precede the corruption offset, and a fresh segment with the next index is
// created and made the active one.
func (w *WL) Repair(origErr error) error {
        // We could probably have a mode that only discards torn records right around
        // the corruption to preserve as much data as possible.
        // But that's not generally applicable if the records have any kind of causality.
        // Maybe as an extra mode in the future if mid-WAL corruptions become
        // a frequent concern.
        var cerr *CorruptionErr
        if !errors.As(origErr, &cerr) {
                return fmt.Errorf("cannot handle error: %w", origErr)
        }
        if cerr.Segment < 0 {
                return errors.New("corruption error does not specify position")
        }
        level.Warn(w.logger).Log("msg", "Starting corruption repair",
                "segment", cerr.Segment, "offset", cerr.Offset)

        // All segments behind the corruption can no longer be used.
        segs, err := listSegments(w.Dir())
        if err != nil {
                return fmt.Errorf("list segments: %w", err)
        }
        level.Warn(w.logger).Log("msg", "Deleting all segments newer than corrupted segment", "segment", cerr.Segment)

        for _, s := range segs {
                if w.segment.i == s.index {
                        // The active segment needs to be removed,
                        // close it first (Windows!). Can be closed safely
                        // as we set the current segment to repaired file
                        // below.
                        if err := w.segment.Close(); err != nil {
                                return fmt.Errorf("close active segment: %w", err)
                        }
                }
                // Segments up to and including the corrupted one are kept on disk.
                if s.index <= cerr.Segment {
                        continue
                }
                if err := os.Remove(filepath.Join(w.Dir(), s.name)); err != nil {
                        return fmt.Errorf("delete segment:%v: %w", s.index, err)
                }
        }
        // Regardless of the corruption offset, no record reaches into the previous segment.
        // So we can safely repair the WAL by removing the segment and re-inserting all
        // its records up to the corruption.
        level.Warn(w.logger).Log("msg", "Rewrite corrupted segment", "segment", cerr.Segment)

        // Move the corrupted file aside; it is read back from tmpfn below.
        fn := SegmentName(w.Dir(), cerr.Segment)
        tmpfn := fn + ".repair"

        if err := fileutil.Rename(fn, tmpfn); err != nil {
                return err
        }
        // Create a clean segment and make it the active one.
        s, err := CreateSegment(w.Dir(), cerr.Segment)
        if err != nil {
                return err
        }
        if err := w.setSegment(s); err != nil {
                return err
        }

        f, err := os.Open(tmpfn)
        if err != nil {
                return fmt.Errorf("open segment: %w", err)
        }
        defer f.Close()

        r := NewReader(bufio.NewReader(f))

        for r.Next() {
                // Add records only up to where the error was.
                if r.Offset() >= cerr.Offset {
                        break
                }
                if err := w.Log(r.Record()); err != nil {
                        return fmt.Errorf("insert record: %w", err)
                }
        }
        // We expect an error here from r.Err(), so nothing to handle.

        // We need to pad to the end of the last page in the repaired segment.
        if err := w.flushPage(true); err != nil {
                return fmt.Errorf("flush page in repair: %w", err)
        }

        // We explicitly close even when there is a defer for Windows to be
        // able to delete it. The defer is in place to close it in-case there
        // are errors above.
        if err := f.Close(); err != nil {
                return fmt.Errorf("close corrupted file: %w", err)
        }
        if err := os.Remove(tmpfn); err != nil {
                return fmt.Errorf("delete corrupted segment: %w", err)
        }

        // Explicitly close the segment we just repaired to avoid issues with Windows.
        // NOTE(review): the error returned by Close is not checked here.
        s.Close()

        // We always want to start writing to a new Segment rather than an existing
        // Segment, which is handled by NewSize, but earlier in Repair we're deleting
        // all segments that come after the corrupted Segment. Recreate a new Segment here.
        s, err = CreateSegment(w.Dir(), cerr.Segment+1)
        if err != nil {
                return err
        }
        return w.setSegment(s)
}

// SegmentName returns the path of segment i inside dir. The file name is the
// index formatted as a decimal number zero-padded to eight digits.
func SegmentName(dir string, i int) string {
        base := fmt.Sprintf("%08d", i)
        return filepath.Join(dir, base)
}

// NextSegment creates the next segment and closes the previous one asynchronously.
// It returns the file number of the new file.
// The WL mutex is held for the duration of the call.
func (w *WL) NextSegment() (int, error) {
        w.mtx.Lock()
        defer w.mtx.Unlock()
        return w.nextSegment(true)
}

// NextSegmentSync creates the next segment and closes the previous one in sync,
// i.e. the previous segment is synced and closed before this method returns.
// It returns the file number of the new file.
// The WL mutex is held for the duration of the call.
func (w *WL) NextSegmentSync() (int, error) {
        w.mtx.Lock()
        defer w.mtx.Unlock()
        return w.nextSegment(false)
}

// nextSegment switches the WL over to a freshly created segment whose index is
// one higher than the current one, and returns that index. The previous
// segment is fsynced and closed either on the actor goroutine (async) or
// inline before returning. It fails if the WL is already closed.
func (w *WL) nextSegment(async bool) (int, error) {
        if w.closed {
                return 0, errors.New("wlog is closed")
        }

        // A page that holds data is written out in full before switching; an
        // empty page is left alone.
        if w.page.alloc > 0 {
                if err := w.flushPage(true); err != nil {
                        return 0, err
                }
        }

        created, err := CreateSegment(w.Dir(), w.segment.Index()+1)
        if err != nil {
                return 0, fmt.Errorf("create new segment file: %w", err)
        }
        old := w.segment
        if err := w.setSegment(created); err != nil {
                return 0, err
        }

        // finishOld syncs and closes the segment we just moved away from.
        // Failures are only logged.
        finishOld := func() {
                if err := w.fsync(old); err != nil {
                        level.Error(w.logger).Log("msg", "sync previous segment", "err", err)
                }
                if err := old.Close(); err != nil {
                        level.Error(w.logger).Log("msg", "close previous segment", "err", err)
                }
        }

        if !async {
                finishOld()
                return created.Index(), nil
        }
        // Hand the work to the actor so that the fsync of the old segment does
        // not block further writes.
        w.actorc <- finishOld
        return created.Index(), nil
}

// setSegment makes segment the active segment of the WL. It derives donePages
// from the segment's current file size and publishes the segment index via
// the currentSegment metric. The segment is assigned before Stat is called,
// so it stays assigned even if Stat fails.
func (w *WL) setSegment(segment *Segment) error {
        w.segment = segment

        info, err := segment.Stat()
        if err != nil {
                return err
        }

        // Number of whole pages already present in the segment file.
        pages := info.Size() / pageSize
        w.donePages = int(pages)

        w.metrics.currentSegment.Set(float64(segment.Index()))
        return nil
}

// flushPage writes the not yet flushed part of the current page to the active
// segment. When the page is full, or when clear is true, the page is written
// up to its end and then reset so that a new page is started. Otherwise only
// the bytes allocated so far are written and the page stays in use.
func (w *WL) flushPage(clear bool) error {
        w.metrics.pageFlushes.Inc()

        pg := w.page
        if !clear && pg.full() {
                clear = true
        }
        if clear {
                // Extend the allocation so the write below covers the rest of the page.
                pg.alloc = pageSize
        }

        written, err := w.segment.Write(pg.buf[pg.flushed:pg.alloc])
        // Account for whatever was written, also when the write failed.
        pg.flushed += written
        if err != nil {
                return err
        }
        if !clear {
                return nil
        }

        // A whole page went to disk: start over with an empty one.
        pg.reset()
        w.donePages++
        w.metrics.pageCompletions.Inc()
        return nil
}

// Layout of the first byte of a record header:
//
//        [3 bits unallocated] [1 bit zstd compression flag] [1 bit snappy compression flag] [3 bit record type ]
const (
        recTypeMask = snappyMask - 1 // The three lowest bits carry the record type.
        snappyMask  = 1 << 3         // Set when the record is snappy compressed.
        zstdMask    = 1 << 4         // Set when the record is zstd compressed.
)

// recType is the record type stored in the lowest three bits of the header byte.
type recType uint8

const (
        recPageTerm recType = iota // Rest of page is empty.
        recFull                    // Full record.
        recFirst                   // First fragment of a record.
        recMiddle                  // Middle fragments of a record.
        recLast                    // Final fragment of a record.
)

// recTypeFromHeader extracts the record type from a header byte, dropping the
// compression flags and the unallocated bits.
func recTypeFromHeader(header byte) recType {
        typeBits := header & recTypeMask
        return recType(typeBits)
}

// String returns the human readable name of the record type, or "<invalid>"
// for values that do not correspond to a defined type.
func (t recType) String() string {
        names := [...]string{
                recPageTerm: "zero",
                recFull:     "full",
                recFirst:    "first",
                recMiddle:   "middle",
                recLast:     "last",
        }
        if int(t) >= len(names) {
                return "<invalid>"
        }
        return names[t]
}

// pagesPerSegment returns how many pages of pageSize bytes fit into a segment
// of the configured segmentSize.
func (w *WL) pagesPerSegment() int {
        return w.segmentSize / pageSize
}

// Log appends the given records to the log, in order, while holding the WL
// mutex. Passing several records in one call reduces the number of writes:
// only the last record of the batch is marked as final when handed to log.
// On the first failure the writesFailed metric is incremented and the error
// is returned; the remaining records are not written.
func (w *WL) Log(recs ...[]byte) error {
        w.mtx.Lock()
        defer w.mtx.Unlock()

        // Callers could implement their own list record format, but handling
        // batches here saves them that overhead.
        lastIdx := len(recs) - 1
        for idx := range recs {
                err := w.log(recs[idx], idx == lastIdx)
                if err == nil {
                        continue
                }
                w.metrics.writesFailed.Inc()
                return err
        }
        return nil
}

// log writes rec to the log and forces a flush of the current page if:
// - the final record of a batch
// - the record is bigger than the page size
// - the current page is full.
//
// Non-empty records are compressed first when compression is enabled; the
// compressed form is only used when it is smaller than the original. A record
// that does not fit into the remaining space of the active segment causes a
// switch to the next segment. The record is then split into fragments, one
// per page, each prefixed with a header holding the type, length and CRC32.
func (w *WL) log(rec []byte, final bool) error {
        // When the last page flush failed the page will remain full.
        // When the page is full, need to flush it before trying to add more records to it.
        if w.page.full() {
                if err := w.flushPage(true); err != nil {
                        return err
                }
        }

        // Compress the record before calculating if a new segment is needed.
        compressed := false
        if w.compress == CompressionSnappy && len(rec) > 0 {
                // If MaxEncodedLen is less than 0 the record is too large to be compressed.
                if snappy.MaxEncodedLen(len(rec)) >= 0 {
                        // The snappy library uses `len` to calculate if we need a new buffer.
                        // In order to allocate as few buffers as possible make the length
                        // equal to the capacity.
                        w.compressBuf = w.compressBuf[:cap(w.compressBuf)]
                        w.compressBuf = snappy.Encode(w.compressBuf, rec)
                        if len(w.compressBuf) < len(rec) {
                                rec = w.compressBuf
                                compressed = true
                        }
                }
        } else if w.compress == CompressionZstd && len(rec) > 0 {
                w.compressBuf = w.zstdWriter.EncodeAll(rec, w.compressBuf[:0])
                if len(w.compressBuf) < len(rec) {
                        rec = w.compressBuf
                        compressed = true
                }
        }

        // If the record is too big to fit within the active page in the current
        // segment, terminate the active segment and advance to the next one.
        // This ensures that records do not cross segment boundaries.
        left := w.page.remaining() - recordHeaderSize                                   // Free space in the active page.
        left += (pageSize - recordHeaderSize) * (w.pagesPerSegment() - w.donePages - 1) // Free pages in the active segment.

        if len(rec) > left {
                if _, err := w.nextSegment(true); err != nil {
                        return err
                }
        }

        // Populate as many pages as necessary to fit the record.
        // Be careful to always do one pass to ensure we write zero-length records.
        for i := 0; i == 0 || len(rec) > 0; i++ {
                p := w.page

                // Find how much of the record we can fit into the page.
                var (
                        l    = min(len(rec), (pageSize-p.alloc)-recordHeaderSize)
                        part = rec[:l]
                        buf  = p.buf[p.alloc:]
                        typ  recType
                )

                // The fragment type follows from whether this is the first pass and
                // whether the fragment holds everything that is left of the record.
                switch {
                case i == 0 && len(part) == len(rec):
                        typ = recFull
                case len(part) == len(rec):
                        typ = recLast
                case i == 0:
                        typ = recFirst
                default:
                        typ = recMiddle
                }
                if compressed {
                        if w.compress == CompressionSnappy {
                                typ |= snappyMask
                        } else if w.compress == CompressionZstd {
                                typ |= zstdMask
                        }
                }

                // Header: 1 byte type, 2 bytes length, 4 bytes CRC32 of the fragment.
                buf[0] = byte(typ)
                crc := crc32.Checksum(part, castagnoliTable)
                binary.BigEndian.PutUint16(buf[1:], uint16(len(part)))
                binary.BigEndian.PutUint32(buf[3:], crc)

                copy(buf[recordHeaderSize:], part)
                p.alloc += len(part) + recordHeaderSize

                if w.page.full() {
                        if err := w.flushPage(true); err != nil {
                                // TODO When the flushing fails at this point and the record has not been
                                // fully written to the buffer, we end up with a corrupted WAL because some part of the
                                // record have been written to the buffer, while the rest of the record will be discarded.
                                return err
                        }
                }
                rec = rec[l:]
        }

        // If it's the final record of the batch and the page is not empty, flush it.
        if final && w.page.alloc > 0 {
                if err := w.flushPage(false); err != nil {
                        return err
                }
        }

        return nil
}

// LastSegmentAndOffset reports the index of the last segment found in the WAL
// directory together with the write offset inside the active segment, which
// is computed from the number of completed pages plus the bytes allocated in
// the current page. The WL mutex is held while both values are read.
func (w *WL) LastSegmentAndOffset() (seg, offset int, err error) {
        w.mtx.Lock()
        defer w.mtx.Unlock()

        _, lastSeg, listErr := Segments(w.Dir())
        if listErr != nil {
                return lastSeg, 0, listErr
        }

        filled := w.donePages*pageSize + w.page.alloc
        return lastSeg, filled, nil
}

// Truncate removes every segment file whose index is lower than i. Each call
// increments the truncateTotal metric, and the truncateFail metric as well
// when an error is returned. Removal stops at the first failure.
func (w *WL) Truncate(i int) (err error) {
        w.metrics.truncateTotal.Inc()
        defer func() {
                // err is the named result, so this sees the value being returned.
                if err != nil {
                        w.metrics.truncateFail.Inc()
                }
        }()

        segs, err := listSegments(w.Dir())
        if err != nil {
                return err
        }
        // segs is sorted by index, so the first segment at or above i ends the work.
        for _, seg := range segs {
                if seg.index >= i {
                        return nil
                }
                path := filepath.Join(w.Dir(), seg.name)
                if err = os.Remove(path); err != nil {
                        return err
                }
        }
        return nil
}

// fsync syncs the given segment and records how long the call took in the
// fsyncDuration metric, whether or not the sync succeeded.
func (w *WL) fsync(f *Segment) error {
        begin := time.Now()
        syncErr := f.Sync()
        elapsed := time.Since(begin)
        w.metrics.fsyncDuration.Observe(elapsed.Seconds())
        return syncErr
}

// Sync forces a file sync on the current write log segment. This function is meant
// to be used only in tests due to different behaviour on operating systems
// like Windows and Linux.
// NOTE(review): unlike the other exported methods it does not take the WL mutex.
func (w *WL) Sync() error {
        return w.fsync(w.segment)
}

// Close flushes all writes and closes the active segment.
//
// It returns an error when the WL is already closed or when flushing the
// current page fails. Errors from syncing or closing the active segment are
// only logged. A WL without an active segment is merely marked as closed.
func (w *WL) Close() (err error) {
        w.mtx.Lock()
        defer w.mtx.Unlock()

        if w.closed {
                return errors.New("wlog already closed")
        }

        if w.segment == nil {
                w.closed = true
                return nil
        }

        // Flush the last page and zero out all its remaining size.
        // We must not flush an empty page as it would falsely signal
        // the segment is done if we start writing to it again after opening.
        if w.page.alloc > 0 {
                if err := w.flushPage(true); err != nil {
                        return err
                }
        }

        // Ask the actor goroutine to stop and wait until it has processed all
        // queued functions (it closes donec once done).
        donec := make(chan struct{})
        w.stopc <- donec
        <-donec

        // The assignment to the named result has no effect on the returned value,
        // as the function returns nil explicitly below.
        if err = w.fsync(w.segment); err != nil {
                level.Error(w.logger).Log("msg", "sync previous segment", "err", err)
        }
        if err := w.segment.Close(); err != nil {
                level.Error(w.logger).Log("msg", "close previous segment", "err", err)
        }

        w.metrics.Unregister()
        w.closed = true
        return nil
}

// Segments reports the indexes of the first and the last segment that exist
// in wlDir. Both are -1 when the directory holds no segments. When listing
// fails, both are 0 and the error is returned.
func Segments(wlDir string) (first, last int, err error) {
        refs, err := listSegments(wlDir)
        switch {
        case err != nil:
                return 0, 0, err
        case len(refs) == 0:
                return -1, -1, nil
        }

        oldest, newest := refs[0], refs[len(refs)-1]
        return oldest.index, newest.index, nil
}

// segmentRef identifies a segment file by its file name and by the index
// parsed from that name.
type segmentRef struct {
        name  string
        index int
}

// listSegments returns a reference for every entry in dir whose name parses
// as an integer, sorted by ascending index. Entries with other names are
// ignored. An error is returned when the directory cannot be read or when the
// indexes found do not form a gapless sequence.
func listSegments(dir string) (refs []segmentRef, err error) {
        files, err := os.ReadDir(dir)
        if err != nil {
                return nil, err
        }
        for _, f := range files {
                fn := f.Name()
                k, err := strconv.Atoi(fn)
                if err != nil {
                        // Not a segment file.
                        continue
                }
                refs = append(refs, segmentRef{name: fn, index: k})
        }
        // Compare explicitly instead of returning a.index - b.index: the
        // subtraction can overflow for indexes that are far apart, which yields
        // a wrong sign and therefore a wrong order.
        slices.SortFunc(refs, func(a, b segmentRef) int {
                switch {
                case a.index < b.index:
                        return -1
                case a.index > b.index:
                        return 1
                default:
                        return 0
                }
        })
        for i := 0; i < len(refs)-1; i++ {
                if refs[i].index+1 != refs[i+1].index {
                        return nil, errors.New("segments are not sequential")
                }
        }
        return refs, nil
}

// SegmentRange groups segments by the directory and the first and last index it includes.
// A negative First or Last leaves the range open on that end when the range
// is passed to NewSegmentsRangeReader.
type SegmentRange struct {
        Dir         string
        First, Last int
}

// NewSegmentsReader returns a new reader over all segments in the directory.
// It does so by requesting a range that is open on both ends (-1, -1).
func NewSegmentsReader(dir string) (io.ReadCloser, error) {
        return NewSegmentsRangeReader(SegmentRange{dir, -1, -1})
}

// NewSegmentsRangeReader returns a new reader over the given WAL segment ranges.
// If first or last are -1, the range is open on the respective end.
//
// The ranges are processed in the order given and the segments of each range
// are opened for reading in ascending index order. If listing a directory or
// opening a segment fails, all segments opened up to that point are closed
// again before the error is returned.
func NewSegmentsRangeReader(sr ...SegmentRange) (io.ReadCloser, error) {
        var segs []*Segment

        // closeOpened releases the segments opened so far. It is only used on
        // the error paths; errors from Close are dropped there because the error
        // that aborted the construction is the one reported to the caller.
        closeOpened := func() {
                for _, s := range segs {
                        _ = s.Close()
                }
        }

        for _, sgmRange := range sr {
                refs, err := listSegments(sgmRange.Dir)
                if err != nil {
                        closeOpened()
                        return nil, fmt.Errorf("list segment in dir:%v: %w", sgmRange.Dir, err)
                }

                for _, r := range refs {
                        // refs is sorted by index: skip what lies before the range and
                        // stop at the first segment past its end.
                        if sgmRange.First >= 0 && r.index < sgmRange.First {
                                continue
                        }
                        if sgmRange.Last >= 0 && r.index > sgmRange.Last {
                                break
                        }
                        s, err := OpenReadSegment(filepath.Join(sgmRange.Dir, r.name))
                        if err != nil {
                                closeOpened()
                                return nil, fmt.Errorf("open segment:%v in dir:%v: %w", r.name, sgmRange.Dir, err)
                        }
                        segs = append(segs, s)
                }
        }
        return NewSegmentBufReader(segs...), nil
}

// segmentBufReader is a buffered reader that reads in multiples of pages.
// The main purpose is that we are able to track segment and offset for
// corruption reporting.  We have to be careful not to increment cur too
// early, as it is used by Reader.Err() to tell Repair which segment is corrupt.
// As such we pad the end of non-page aligned segments with zeros.
type segmentBufReader struct {
        buf  *bufio.Reader // Buffered reader over the segment currently being read.
        segs []*Segment    // All segments to read, in reading order.
        cur  int           // Index into segs.
        off  int           // Offset of read data into current segment.
}

// NewSegmentBufReader returns a reader over the given segments, starting with
// the first one. The underlying buffer holds 16 pages. Without any segments
// an empty reader is returned.
func NewSegmentBufReader(segs ...*Segment) io.ReadCloser {
        r := &segmentBufReader{}
        if len(segs) > 0 {
                r.segs = segs
                r.buf = bufio.NewReaderSize(segs[0], 16*pageSize)
        }
        return r
}

// NewSegmentBufReaderWithOffset returns a reader over the given segments that
// has already discarded the first offset bytes of the first segment. When
// there is nothing to skip (offset is zero or negative) or there are no
// segments, the result is the same as that of NewSegmentBufReader. The reader
// is returned together with any error reported by the discard.
func NewSegmentBufReaderWithOffset(offset int, segs ...*Segment) (io.ReadCloser, error) {
        if offset <= 0 || len(segs) == 0 {
                return NewSegmentBufReader(segs...), nil
        }

        reader := &segmentBufReader{
                segs: segs,
                buf:  bufio.NewReaderSize(segs[0], 16*pageSize),
        }
        _, err := reader.buf.Discard(offset)
        return reader, err
}

// Close closes every segment of the reader. All segments are closed even if
// some of them fail; the error of the last failing Close is returned.
func (r *segmentBufReader) Close() (err error) {
        for i := range r.segs {
                if closeErr := r.segs[i].Close(); closeErr != nil {
                        err = closeErr
                }
        }
        return err
}

// Read implements io.Reader.
//
// It reads from the current segment. When the current segment is exhausted at
// an offset that is not a multiple of the page size, zero bytes are handed out
// until the page boundary is reached (or b is full). When it is exhausted on a
// page boundary, the reader moves on to the next segment, or returns io.EOF if
// there is none. A reader without segments always returns io.EOF.
func (r *segmentBufReader) Read(b []byte) (n int, err error) {
        if len(r.segs) == 0 {
                return 0, io.EOF
        }

        n, err = r.buf.Read(b)
        r.off += n

        // If we succeeded, or hit a non-EOF, we can stop.
        if err == nil || !errors.Is(err, io.EOF) {
                return n, err
        }

        // We hit EOF; fake out zero padding at the end of short segments, so we
        // don't increment cur too early and report the wrong segment as corrupt.
        if r.off%pageSize != 0 {
                i := 0
                for ; n+i < len(b) && (r.off+i)%pageSize != 0; i++ {
                        b[n+i] = 0
                }

                // Return early, even if we didn't fill b.
                r.off += i
                return n + i, nil
        }

        // There is no more data left in the cur segment and there are no more
        // segments left.  Return EOF.
        if r.cur+1 >= len(r.segs) {
                return n, io.EOF
        }

        // Move to next segment.
        r.cur++
        r.off = 0
        r.buf.Reset(r.segs[r.cur])
        return n, nil
}

// Size computes the size of the write log.
// It returns the result of fileutil.DirSize for the WAL directory, i.e. the
// sizes of the files under the WAL dir added up.
func (w *WL) Size() (int64, error) {
        return fileutil.DirSize(w.Dir())
}

// Copyright 2024 The Prometheus Authors
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package almost

import "math"

// minNormal is the smallest positive normal value of type float64.
var minNormal = math.Float64frombits(0x0010000000000000)

// Equal reports whether a and b are equal within a relative tolerance of
// epsilon: their difference, divided by the sum of their absolute values,
// must be below epsilon. Two NaNs are considered equal. If either value is
// zero, or both are extremely close to zero, the difference is compared
// against epsilon scaled by the smallest normal float64 instead.
func Equal(a, b, epsilon float64) bool {
        switch {
        case math.IsNaN(a) && math.IsNaN(b):
                // NaN never equals anything, but for testing two NaNs count as a match.
                return true
        case a == b:
                // Shortcut that also covers infinities of the same sign.
                // Cf. http://floating-point-gui.de/errors/comparison/
                return true
        }

        delta := math.Abs(a - b)
        magnitude := math.Abs(a) + math.Abs(b)

        // Relative error is meaningless this close to zero.
        nearZero := a == 0 || b == 0 || magnitude < minNormal
        if nearZero {
                return delta < epsilon*minNormal
        }
        return delta/math.Min(magnitude, math.MaxFloat64) < epsilon
}

// Copyright 2023 The Prometheus Authors
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package annotations

import (
        "errors"
        "fmt"

        "github.com/prometheus/common/model"

        "github.com/prometheus/prometheus/promql/parser/posrange"
)

// Annotations is a general wrapper for warnings and other information
// that is returned by the query API along with the results.
// Each individual annotation is modeled by a Go error.
// They are deduplicated based on the string returned by error.Error().
// The zero value is usable without further initialization, see New().
type Annotations map[string]error

// New returns new Annotations ready to use. Note that the zero value of
// Annotations is also fully usable, but using this method is often more
// readable.
func New() *Annotations {
        return &Annotations{}
}

// Add adds an annotation (modeled as a Go error) in-place and returns the
// modified Annotations for convenience. The annotation is stored under its
// error string, so adding an error with the same string again replaces the
// earlier one. A nil error is ignored and leaves the Annotations unchanged.
func (a *Annotations) Add(err error) Annotations {
        if err == nil {
                // There is nothing to record, and calling Error() on a nil error
                // would panic.
                return *a
        }
        if *a == nil {
                *a = Annotations{}
        }
        (*a)[err.Error()] = err
        return *a
}

// Merge adds the contents of the second annotation to the first, modifying
// the first in-place, and returns the merged first Annotation for convenience.
// Entries of aa replace entries of a that have the same key. If both are nil,
// nil is returned and a stays nil.
func (a *Annotations) Merge(aa Annotations) Annotations {
        if *a == nil {
                if aa == nil {
                        return nil
                }
                *a = Annotations{}
        }
        for key, val := range aa {
                (*a)[key] = val
        }
        return *a
}

// AsErrors is a convenience function to return the annotations map as a slice
// of errors. The order of the slice follows map iteration and is therefore
// not defined.
func (a Annotations) AsErrors() []error {
        arr := make([]error, 0, len(a))
        for _, err := range a {
                arr = append(arr, err)
        }
        return arr
}

// AsStrings renders the annotations as a slice of strings. For annotations
// that carry position information, query is used to resolve the line number
// and character offset of the element that triggered the annotation. If
// maxAnnos is greater than zero, at most maxAnnos annotations are rendered
// and one extra entry states how many were omitted; 0 means no limit.
func (a Annotations) AsStrings(query string, maxAnnos int) []string {
        limited := maxAnnos > 0
        out := make([]string, 0, len(a))
        for _, anno := range a {
                if limited && len(out) >= maxAnnos {
                        break
                }
                // Positioned annotations need the query text to render their position.
                var positioned annoErr
                if errors.As(anno, &positioned) {
                        positioned.Query = query
                        anno = positioned
                }
                out = append(out, anno.Error())
        }
        if omitted := len(a) - maxAnnos; limited && omitted > 0 {
                out = append(out, fmt.Sprintf("%d more annotations omitted", omitted))
        }
        return out
}

// CountWarningsAndInfo returns how many annotations match PromQLWarning and
// how many match PromQLInfo, in that order. Matching is done with errors.Is.
func (a Annotations) CountWarningsAndInfo() (int, int) {
        warnings, infos := 0, 0
        for _, anno := range a {
                if errors.Is(anno, PromQLWarning) {
                        warnings++
                }
                if errors.Is(anno, PromQLInfo) {
                        infos++
                }
        }
        return warnings, infos
}

//nolint:revive // error-naming.
var (
        // Currently there are only 2 types, warnings and info.
        // For now, info are visually identical with warnings as we have not updated
        // the API spec or the frontend to show a different kind of warning. But we
        // make the distinction here to prepare for adding them in future.
        PromQLInfo    = errors.New("PromQL info")
        PromQLWarning = errors.New("PromQL warning")

        // Warning sentinels. Each wraps PromQLWarning, so errors.Is(err, PromQLWarning)
        // matches them and anything that wraps them.
        InvalidQuantileWarning                     = fmt.Errorf("%w: quantile value should be between 0 and 1", PromQLWarning)
        BadBucketLabelWarning                      = fmt.Errorf("%w: bucket label %q is missing or has a malformed value", PromQLWarning, model.BucketLabel)
        MixedFloatsHistogramsWarning               = fmt.Errorf("%w: encountered a mix of histograms and floats for", PromQLWarning)
        MixedClassicNativeHistogramsWarning        = fmt.Errorf("%w: vector contains a mix of classic and native histograms for metric name", PromQLWarning)
        NativeHistogramNotCounterWarning           = fmt.Errorf("%w: this native histogram metric is not a counter:", PromQLWarning)
        NativeHistogramNotGaugeWarning             = fmt.Errorf("%w: this native histogram metric is not a gauge:", PromQLWarning)
        MixedExponentialCustomHistogramsWarning    = fmt.Errorf("%w: vector contains a mix of histograms with exponential and custom buckets schemas for metric name", PromQLWarning)
        IncompatibleCustomBucketsHistogramsWarning = fmt.Errorf("%w: vector contains histograms with incompatible custom buckets for metric name", PromQLWarning)

        // Info sentinels. Each wraps PromQLInfo.
        PossibleNonCounterInfo                  = fmt.Errorf("%w: metric might not be a counter, name does not end in _total/_sum/_count/_bucket:", PromQLInfo)
        HistogramQuantileForcedMonotonicityInfo = fmt.Errorf("%w: input to histogram_quantile needed to be fixed for monotonicity (see https://prometheus.io/docs/prometheus/latest/querying/functions/#histogram_quantile) for metric name", PromQLInfo)
)

// annoErr is an annotation error that carries the position range it refers
// to. Query is left empty by the constructors in this file and is filled in
// by Annotations.AsStrings so that Error can render the position.
type annoErr struct {
        PositionRange posrange.PositionRange
        Err           error
        Query         string
}

// Error returns the message of the wrapped error. When Query is set, the
// start position of the annotation within that query is appended in
// parentheses.
func (e annoErr) Error() string {
        if e.Query != "" {
                return fmt.Sprintf("%s (%s)", e.Err, e.PositionRange.StartPosInput(e.Query, 0))
        }
        return e.Err.Error()
}

// Unwrap returns the wrapped error, which lets errors.Is and errors.As see
// through annoErr.
func (e annoErr) Unwrap() error {
        return e.Err
}

// NewInvalidQuantileWarning builds the annotation for an invalid quantile
// value q, i.e. a float that is outside the range [0, 1] or NaN. The result
// wraps InvalidQuantileWarning and records pos as its position.
func NewInvalidQuantileWarning(q float64, pos posrange.PositionRange) error {
        wrapped := fmt.Errorf("%w, got %g", InvalidQuantileWarning, q)
        return annoErr{Err: wrapped, PositionRange: pos}
}

// NewBadBucketLabelWarning is used when there is an error parsing the bucket label
// of a classic histogram.
// The returned error wraps BadBucketLabelWarning, quotes both the label and the
// metric name in its message and carries pos as its position range.
func NewBadBucketLabelWarning(metricName, label string, pos posrange.PositionRange) error {
        wrapped := fmt.Errorf("%w of %q for metric name %q", BadBucketLabelWarning, label, metricName)
        return annoErr{Err: wrapped, PositionRange: pos}
}

// NewMixedFloatsHistogramsWarning is used when the queried series includes both
// float samples and histogram samples for functions that do not support mixed
// samples.
// The returned error wraps MixedFloatsHistogramsWarning, names the metric and
// carries pos as its position range.
func NewMixedFloatsHistogramsWarning(metricName string, pos posrange.PositionRange) error {
        wrapped := fmt.Errorf("%w metric name %q", MixedFloatsHistogramsWarning, metricName)
        return annoErr{Err: wrapped, PositionRange: pos}
}

// NewMixedFloatsHistogramsAggWarning is used when the queried series includes both
// float samples and histogram samples in an aggregation.
// The returned error wraps MixedFloatsHistogramsWarning and carries pos as its
// position range; no metric name is included.
func NewMixedFloatsHistogramsAggWarning(pos posrange.PositionRange) error {
        wrapped := fmt.Errorf("%w aggregation", MixedFloatsHistogramsWarning)
        return annoErr{Err: wrapped, PositionRange: pos}
}

// NewMixedClassicNativeHistogramsWarning is used when the queried series includes
// both classic and native histograms.
// The returned error wraps MixedClassicNativeHistogramsWarning, appends the
// quoted metric name and carries pos as its position range.
func NewMixedClassicNativeHistogramsWarning(metricName string, pos posrange.PositionRange) error {
        wrapped := fmt.Errorf("%w %q", MixedClassicNativeHistogramsWarning, metricName)
        return annoErr{Err: wrapped, PositionRange: pos}
}

// NewNativeHistogramNotCounterWarning is used when histogramRate is called
// with isCounter set to true on a gauge histogram.
// The returned error wraps NativeHistogramNotCounterWarning, appends the
// quoted metric name and carries pos as its position range.
func NewNativeHistogramNotCounterWarning(metricName string, pos posrange.PositionRange) error {
        wrapped := fmt.Errorf("%w %q", NativeHistogramNotCounterWarning, metricName)
        return annoErr{Err: wrapped, PositionRange: pos}
}

// NewNativeHistogramNotGaugeWarning is used when histogramRate is called
// with isCounter set to false on a counter histogram.
// The returned error wraps NativeHistogramNotGaugeWarning, appends the quoted
// metric name and carries pos as its position range.
func NewNativeHistogramNotGaugeWarning(metricName string, pos posrange.PositionRange) error {
        wrapped := fmt.Errorf("%w %q", NativeHistogramNotGaugeWarning, metricName)
        return annoErr{Err: wrapped, PositionRange: pos}
}

// NewMixedExponentialCustomHistogramsWarning is used when the queried series includes
// histograms with both exponential and custom buckets schemas.
// The returned error wraps MixedExponentialCustomHistogramsWarning, appends the
// quoted metric name and carries pos as its position range.
func NewMixedExponentialCustomHistogramsWarning(metricName string, pos posrange.PositionRange) error {
        wrapped := fmt.Errorf("%w %q", MixedExponentialCustomHistogramsWarning, metricName)
        return annoErr{Err: wrapped, PositionRange: pos}
}

// NewIncompatibleCustomBucketsHistogramsWarning is used when the queried series includes
// custom buckets histograms with incompatible custom bounds.
// The returned error wraps IncompatibleCustomBucketsHistogramsWarning, appends
// the quoted metric name and carries pos as its position range.
func NewIncompatibleCustomBucketsHistogramsWarning(metricName string, pos posrange.PositionRange) error {
        wrapped := fmt.Errorf("%w %q", IncompatibleCustomBucketsHistogramsWarning, metricName)
        return annoErr{Err: wrapped, PositionRange: pos}
}

// NewPossibleNonCounterInfo is used when a named counter metric with only float samples does not
// have the suffixes _total, _sum, _count, or _bucket.
// The returned error wraps PossibleNonCounterInfo, appends the quoted metric
// name and carries pos as its position range.
func NewPossibleNonCounterInfo(metricName string, pos posrange.PositionRange) error {
        wrapped := fmt.Errorf("%w %q", PossibleNonCounterInfo, metricName)
        return annoErr{Err: wrapped, PositionRange: pos}
}

// NewHistogramQuantileForcedMonotonicityInfo is used when the input (classic histograms) to
// histogram_quantile needs to be forced to be monotonic.
// The returned error wraps HistogramQuantileForcedMonotonicityInfo, appends the
// quoted metric name and carries pos as its position range.
func NewHistogramQuantileForcedMonotonicityInfo(metricName string, pos posrange.PositionRange) error {
        wrapped := fmt.Errorf("%w %q", HistogramQuantileForcedMonotonicityInfo, metricName)
        return annoErr{Err: wrapped, PositionRange: pos}
}

// Copyright 2013 The Prometheus Authors
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package stats

import (
        "context"
        "encoding/json"
        "fmt"

        "github.com/prometheus/client_golang/prometheus"
        "go.opentelemetry.io/otel"
        "go.opentelemetry.io/otel/trace"
)

// QueryTiming identifies the code area or functionality in which time is spent
// during a query.
type QueryTiming int

// Query timings.
const (
        EvalTotalTime QueryTiming = iota
        ResultSortTime
        QueryPreparationTime
        InnerEvalTime
        ExecQueueTime
        ExecTotalTime
)

// String returns the human-readable name of a QueryTiming identifier, or
// "Unknown query timing" for a value that is not one of the declared constants.
func (s QueryTiming) String() string {
        names := [...]string{
                EvalTotalTime:        "Eval total time",
                ResultSortTime:       "Result sorting time",
                QueryPreparationTime: "Query preparation time",
                InnerEvalTime:        "Inner eval time",
                ExecQueueTime:        "Exec queue wait time",
                ExecTotalTime:        "Exec total time",
        }
        if s < 0 || int(s) >= len(names) {
                return "Unknown query timing"
        }
        return names[s]
}

// SpanOperation returns the tracing span operation name for a QueryTiming, or
// "Unknown query timing" for a value that is not one of the declared constants.
func (s QueryTiming) SpanOperation() string {
        operations := [...]string{
                EvalTotalTime:        "promqlEval",
                ResultSortTime:       "promqlSort",
                QueryPreparationTime: "promqlPrepare",
                InnerEvalTime:        "promqlInnerEval",
                ExecQueueTime:        "promqlExecQueue",
                ExecTotalTime:        "promqlExec",
        }
        if s < 0 || int(s) >= len(operations) {
                return "Unknown query timing"
        }
        return operations[s]
}

// stepStat represents a single statistic for a given step timestamp.
// T is the step timestamp and V the value recorded for that step.
type stepStat struct {
        T int64
        V int64
}

// String renders the statistic as "<value> @[<timestamp>]".
func (s stepStat) String() string {
        return fmt.Sprintf("%d @[%d]", s.V, s.T)
}

// MarshalJSON implements json.Marshaler. The statistic is encoded as a
// two-element JSON array: T divided by 1000 as a float, followed by V.
func (s stepStat) MarshalJSON() ([]byte, error) {
        scaledT := float64(s.T) / 1000
        pair := []interface{}{scaledT, s.V}
        return json.Marshal(pair)
}

// queryTimings with all query timers mapped to durations.
// It is the JSON shape of the timing statistics: one field per QueryTiming
// constant. NewQueryStats fills each field from the matching timer's
// Duration(), which reports seconds.
type queryTimings struct {
        EvalTotalTime        float64 `json:"evalTotalTime"`
        ResultSortTime       float64 `json:"resultSortTime"`
        QueryPreparationTime float64 `json:"queryPreparationTime"`
        InnerEvalTime        float64 `json:"innerEvalTime"`
        ExecQueueTime        float64 `json:"execQueueTime"`
        ExecTotalTime        float64 `json:"execTotalTime"`
}

// querySamples is the JSON shape of the sample statistics. NewQueryStats
// builds it from a QuerySamples value.
type querySamples struct {
        // TotalQueryableSamplesPerStep holds one entry per step; it is left
        // out of the JSON output when empty.
        TotalQueryableSamplesPerStep []stepStat `json:"totalQueryableSamplesPerStep,omitempty"`
        // TotalQueryableSamples is copied from QuerySamples.TotalSamples.
        TotalQueryableSamples int64 `json:"totalQueryableSamples"`
        // PeakSamples is copied from QuerySamples.PeakSamples.
        PeakSamples int `json:"peakSamples"`
}

// BuiltinStats holds the statistics that Prometheus's core gathers.
// Samples is nil (and omitted from the JSON output) when no sample statistics
// were supplied to NewQueryStats.
type BuiltinStats struct {
        // NOTE(review): encoding/json's omitempty has no effect on a struct
        // value, so Timings is always emitted.
        Timings queryTimings  `json:"timings,omitempty"`
        Samples *querySamples `json:"samples,omitempty"`
}

// QueryStats holds BuiltinStats and any other stats the particular
// implementation wants to collect.
// *BuiltinStats itself satisfies this interface.
type QueryStats interface {
        // Builtin returns the statistics gathered by Prometheus's core.
        Builtin() BuiltinStats
}

// Builtin implements QueryStats by returning a copy of the receiver.
func (s *BuiltinStats) Builtin() BuiltinStats {
        snapshot := *s
        return snapshot
}

// NewQueryStats builds a QueryStats (a *BuiltinStats) from the timers and
// sample counters collected in s.
//
// Every timer in s.Timers whose key is one of the QueryTiming constants is
// copied into the matching timing field via Timer.Duration; other keys are
// ignored. Sample statistics are included only when s.Samples is non-nil.
func NewQueryStats(s *Statistics) QueryStats {
        timers, sampleStats := s.Timers, s.Samples

        result := &BuiltinStats{}
        for key, timer := range timers.TimerGroup.timers {
                // Pick the destination field first, then store the duration once.
                var dst *float64
                switch key {
                case EvalTotalTime:
                        dst = &result.Timings.EvalTotalTime
                case ResultSortTime:
                        dst = &result.Timings.ResultSortTime
                case QueryPreparationTime:
                        dst = &result.Timings.QueryPreparationTime
                case InnerEvalTime:
                        dst = &result.Timings.InnerEvalTime
                case ExecQueueTime:
                        dst = &result.Timings.ExecQueueTime
                case ExecTotalTime:
                        dst = &result.Timings.ExecTotalTime
                default:
                        continue
                }
                *dst = timer.Duration()
        }

        if sampleStats == nil {
                return result
        }
        result.Samples = &querySamples{
                TotalQueryableSamplesPerStep: sampleStats.totalSamplesPerStepPoints(),
                TotalQueryableSamples:        sampleStats.TotalSamples,
                PeakSamples:                  sampleStats.PeakSamples,
        }
        return result
}

// TotalSamplesPerStepMap returns the per-step sample totals keyed by step
// timestamp, or nil when per-step statistics are disabled.
func (qs *QuerySamples) TotalSamplesPerStepMap() *TotalSamplesPerStep {
        if !qs.EnablePerStepStats {
                return nil
        }

        points := qs.totalSamplesPerStepPoints()
        perStep := make(TotalSamplesPerStep, len(points))
        for _, p := range points {
                perStep[p.T] = int(p.V)
        }
        return &perStep
}

// totalSamplesPerStepPoints converts the per-step counters into stepStat
// values. The timestamp of step i is startTimestamp + i*interval. It returns
// nil when per-step statistics are disabled.
func (qs *QuerySamples) totalSamplesPerStepPoints() []stepStat {
        if !qs.EnablePerStepStats {
                return nil
        }

        points := make([]stepStat, 0, len(qs.TotalSamplesPerStep))
        for step, count := range qs.TotalSamplesPerStep {
                ts := qs.startTimestamp + int64(step)*qs.interval
                points = append(points, stepStat{T: ts, V: count})
        }
        return points
}

// SpanTimer unifies tracing and timing, to reduce repetition.
// It bundles a Timer, a tracing span and a set of observers so that a single
// Finish call stops the timer, ends the span and reports to the observers.
type SpanTimer struct {
        // timer is started by NewSpanTimer and stopped by Finish.
        timer *Timer
        // observers each receive one observation (in seconds) from Finish.
        observers []prometheus.Observer

        // span is the tracing span opened by NewSpanTimer and ended by Finish.
        span trace.Span
}

// NewSpanTimer opens a tracing span named operation on ctx, starts timer, and
// returns a SpanTimer tying both together along with the span's context.
// Call Finish on the result to stop the timer, end the span and notify the
// observers.
func NewSpanTimer(ctx context.Context, operation string, timer *Timer, observers ...prometheus.Observer) (*SpanTimer, context.Context) {
        spanCtx, span := otel.Tracer("").Start(ctx, operation)
        timer.Start()

        st := &SpanTimer{
                span:      span,
                timer:     timer,
                observers: observers,
        }
        return st, spanCtx
}

// Finish stops the timer, ends the span, and then reports to every observer
// the time elapsed since the timer was last started, in seconds. The elapsed
// time is re-read for each observer.
func (s *SpanTimer) Finish() {
        s.timer.Stop()
        s.span.End()

        for i := range s.observers {
                elapsed := s.timer.ElapsedTime()
                s.observers[i].Observe(elapsed.Seconds())
        }
}

// Statistics groups the timers and sample counters collected for a query.
// It is the input to NewQueryStats, which tolerates a nil Samples.
type Statistics struct {
        Timers  *QueryTimers
        Samples *QuerySamples
}

// QueryTimers is a TimerGroup whose timers are looked up by QueryTiming
// (see GetSpanTimer).
type QueryTimers struct {
        *TimerGroup
}

// TotalSamplesPerStep maps a step timestamp to the number of samples counted
// for that step.
type TotalSamplesPerStep map[int64]int

// QuerySamples accumulates sample counts for a query: a running total, a peak,
// and optionally a per-step breakdown.
type QuerySamples struct {
        // PeakSamples represent the highest count of samples considered
        // while evaluating a query. It corresponds to the peak value of
        // currentSamples, which is in turn compared against the MaxSamples
        // configured in the engine.
        PeakSamples int

        // TotalSamples represents the total number of samples scanned
        // while evaluating a query.
        TotalSamples int64

        // TotalSamplesPerStep represents the total number of samples scanned
        // per step while evaluating a query. Each step should be identical to the
        // TotalSamples when a step is run as an instant query, which means
        // we intentionally do not account for optimizations that happen inside the
        // range query engine that reduce the actual work that happens.
        TotalSamplesPerStep []int64

        // EnablePerStepStats gates InitStepTracking and the per-step accessors;
        // when false they do nothing or return nil.
        EnablePerStepStats bool
        // startTimestamp and interval are set by InitStepTracking and map a
        // step index to its timestamp (startTimestamp + index*interval).
        startTimestamp int64
        interval       int64
}

// Stats pairs a set of query timers with a set of query sample counters.
type Stats struct {
        TimerStats  *QueryTimers
        SampleStats *QuerySamples
}

// InitStepTracking allocates one per-step counter for every step between
// start and end (inclusive) at the given interval, and records start and
// interval for later index/timestamp conversions. It does nothing when
// per-step statistics are disabled.
func (qs *QuerySamples) InitStepTracking(start, end, interval int64) {
        if qs.EnablePerStepStats {
                steps := int((end-start)/interval) + 1
                qs.TotalSamplesPerStep = make([]int64, steps)
                qs.startTimestamp, qs.interval = start, interval
        }
}

// IncrementSamplesAtStep increments the total samples count. Use this if you know the step index.
// The per-step counter at index i is incremented as well when step tracking
// has been initialised. A nil receiver is a no-op.
func (qs *QuerySamples) IncrementSamplesAtStep(i int, samples int64) {
        if qs == nil {
                return
        }
        qs.TotalSamples += samples

        if perStep := qs.TotalSamplesPerStep; perStep != nil {
                perStep[i] += samples
        }
}

// IncrementSamplesAtTimestamp increments the total samples count. Use this if you only have the corresponding step
// timestamp.
// When step tracking has been initialised, t is converted to a step index via
// (t - startTimestamp) / interval and that per-step counter is incremented
// too. A nil receiver is a no-op.
func (qs *QuerySamples) IncrementSamplesAtTimestamp(t, samples int64) {
        if qs == nil {
                return
        }
        qs.TotalSamples += samples

        if qs.TotalSamplesPerStep == nil {
                return
        }
        step := int((t - qs.startTimestamp) / qs.interval)
        qs.TotalSamplesPerStep[step] += samples
}

// UpdatePeak updates the peak number of samples considered in
// the evaluation of a query as used with the MaxSamples limit.
// The stored peak only ever grows; a nil receiver is a no-op.
func (qs *QuerySamples) UpdatePeak(samples int) {
        if qs != nil && qs.PeakSamples < samples {
                qs.PeakSamples = samples
        }
}

// UpdatePeakFromSubquery updates the peak number of samples considered
// in a query from its evaluation of a subquery.
// The receiver's peak is raised to other's peak if that is larger. Nothing
// happens when either the receiver or other is nil.
func (qs *QuerySamples) UpdatePeakFromSubquery(other *QuerySamples) {
        switch {
        case qs == nil, other == nil:
                return
        case other.PeakSamples > qs.PeakSamples:
                qs.PeakSamples = other.PeakSamples
        }
}

// NewQueryTimers returns a QueryTimers backed by a fresh, empty TimerGroup.
func NewQueryTimers() *QueryTimers {
        return &QueryTimers{TimerGroup: NewTimerGroup()}
}

// NewQuerySamples returns a zeroed QuerySamples with per-step statistics
// enabled or disabled according to enablePerStepStats.
func NewQuerySamples(enablePerStepStats bool) *QuerySamples {
        return &QuerySamples{EnablePerStepStats: enablePerStepStats}
}

// NewChild returns a fresh QuerySamples with per-step statistics disabled.
// Nothing is copied from the receiver.
func (qs *QuerySamples) NewChild() *QuerySamples {
        child := NewQuerySamples(false)
        return child
}

// GetSpanTimer starts a SpanTimer for the given query timing: the span is
// named after qt.SpanOperation() and the timer is the group's timer for qt
// (created on first use). The observers are forwarded to NewSpanTimer.
func (qs *QueryTimers) GetSpanTimer(ctx context.Context, qt QueryTiming, observers ...prometheus.Observer) (*SpanTimer, context.Context) {
        operation := qt.SpanOperation()
        timer := qs.TimerGroup.GetTimer(qt)
        return NewSpanTimer(ctx, operation, timer, observers...)
}

// Copyright 2013 The Prometheus Authors
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package stats

import (
        "bytes"
        "fmt"
        "slices"
        "time"
)

// A Timer that can be started and stopped and accumulates the total time it
// was running (the time between Start() and Stop()).
type Timer struct {
        // name identifies the timer and is used as the prefix in String.
        name fmt.Stringer
        // created is the creation rank assigned by TimerGroup.GetTimer and is
        // used to order timers in TimerGroup.String.
        created int
        // start is the time of the most recent Start call.
        start time.Time
        // duration is the running time accumulated by Stop calls.
        duration time.Duration
}

// Start the timer. It records the current time and returns the receiver so
// calls can be chained.
func (t *Timer) Start() *Timer {
        now := time.Now()
        t.start = now
        return t
}

// Stop the timer, adding the time since the last Start to the accumulated
// duration.
func (t *Timer) Stop() {
        elapsed := t.ElapsedTime()
        t.duration += elapsed
}

// ElapsedTime returns the time that passed since starting the timer.
func (t *Timer) ElapsedTime() time.Duration {
        return time.Since(t.start)
}

// Duration returns the duration value of the timer in seconds.
func (t *Timer) Duration() float64 {
        total := t.duration
        return total.Seconds()
}

// String returns "<name>: <accumulated duration>".
func (t *Timer) String() string {
        return fmt.Sprintf("%s: %s", t.name, t.duration)
}

// A TimerGroup represents a group of timers relevant to a single query.
// Timers are keyed by the fmt.Stringer passed to GetTimer.
type TimerGroup struct {
        timers map[fmt.Stringer]*Timer
}

// NewTimerGroup constructs a new TimerGroup with an empty, ready-to-use
// timer map.
func NewTimerGroup() *TimerGroup {
        group := &TimerGroup{}
        group.timers = make(map[fmt.Stringer]*Timer)
        return group
}

// GetTimer gets (and creates, if necessary) the Timer for a given code section.
// A newly created timer records how many timers the group held before it, so
// that String can list timers in creation order.
func (t *TimerGroup) GetTimer(name fmt.Stringer) *Timer {
        timer, ok := t.timers[name]
        if !ok {
                timer = &Timer{name: name, created: len(t.timers)}
                t.timers[name] = timer
        }
        return timer
}

// String returns a string representation of a TimerGroup: one line per timer,
// ordered by creation.
func (t *TimerGroup) String() string {
        ordered := make([]*Timer, 0, len(t.timers))
        for _, timer := range t.timers {
                ordered = append(ordered, timer)
        }

        // Map iteration order is random, so sort by creation rank.
        byCreation := func(a, b *Timer) int { return a.created - b.created }
        slices.SortFunc(ordered, byCreation)

        var out bytes.Buffer
        for _, timer := range ordered {
                fmt.Fprintf(&out, "%s\n", timer)
        }
        return out.String()
}

// Copyright 2015 The Prometheus Authors
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
// NOTE: The functions in this file (Unquote, unquoteChar, contains, unhex)
// have been adapted from the "strconv" package of the Go standard library
// to work for Prometheus-style strings. Go's special-casing for single
// quotes was removed and single quoted strings are now treated the same as
// double-quoted ones.
//
// The original copyright notice from the Go project for these parts is
// reproduced here:
//
// ========================================================================
// Copyright (c) 2009 The Go Authors. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
//    * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//    * Redistributions in binary form must reproduce the above
// copyright notice, this list of conditions and the following disclaimer
// in the documentation and/or other materials provided with the
// distribution.
//    * Neither the name of Google Inc. nor the names of its
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
// ========================================================================

package strutil

import (
        "errors"
        "unicode/utf8"
)

// ErrSyntax indicates that a value does not have the right syntax for the target type.
// Unquote and unquoteChar return it for any malformed string literal.
var ErrSyntax = errors.New("invalid syntax")

// Unquote interprets s as a single-quoted, double-quoted, or backquoted
// Prometheus query language string literal, returning the string value that s
// quotes.
//
// Backquoted literals are returned verbatim and must not contain a backquote.
// Single- and double-quoted literals are treated alike: they must not contain
// a raw newline or an unescaped occurrence of their own quote character, and
// their escape sequences are decoded. ErrSyntax is returned for anything else.
func Unquote(s string) (t string, err error) {
        if len(s) < 2 {
                return "", ErrSyntax
        }
        quote, last := s[0], s[len(s)-1]
        if quote != last {
                return "", ErrSyntax
        }
        body := s[1 : len(s)-1]

        switch quote {
        case '`':
                // Raw literal: no escapes, just make sure it is well-formed.
                if contains(body, '`') {
                        return "", ErrSyntax
                }
                return body, nil
        case '"', '\'':
                // Interpreted literal, handled below.
        default:
                return "", ErrSyntax
        }

        if contains(body, '\n') {
                return "", ErrSyntax
        }

        // Nothing to decode: hand the substring back without allocating.
        if !(contains(body, '\\') || contains(body, quote)) {
                return body, nil
        }

        var scratch [utf8.UTFMax]byte
        out := make([]byte, 0, 3*len(body)/2) // Try to avoid more allocations.
        for len(body) > 0 {
                r, multibyte, rest, err := unquoteChar(body, quote)
                if err != nil {
                        return "", err
                }
                body = rest
                if multibyte && r >= utf8.RuneSelf {
                        width := utf8.EncodeRune(scratch[:], r)
                        out = append(out, scratch[:width]...)
                        continue
                }
                out = append(out, byte(r))
        }
        return string(out), nil
}

// unquoteChar decodes the first character or byte in the escaped string
// or character literal represented by the string s.
// It returns four values:
//
//  1. value, the decoded Unicode code point or byte value;
//  2. multibyte, a boolean indicating whether the decoded character requires a multibyte UTF-8 representation;
//  3. tail, the remainder of the string after the character; and
//  4. an error that will be nil if the character is syntactically valid.
//
// The second argument, quote, specifies the type of literal being parsed
// and therefore which escaped quote character is permitted.
// If set to a single quote, it permits the sequence \' and disallows unescaped '.
// If set to a double quote, it permits \" and disallows unescaped ".
// If set to zero, it does not permit either escape and allows both quote characters to appear unescaped.
//
// An empty s is reported as ErrSyntax.
func unquoteChar(s string, quote byte) (value rune, multibyte bool, tail string, err error) {
        // Guard against empty input so that s[0] below cannot panic.
        if len(s) == 0 {
                err = ErrSyntax
                return
        }

        // easy cases
        switch c := s[0]; {
        case c == quote && (quote == '\'' || quote == '"'):
                err = ErrSyntax
                return
        case c >= utf8.RuneSelf:
                r, size := utf8.DecodeRuneInString(s)
                return r, true, s[size:], nil
        case c != '\\':
                return rune(s[0]), false, s[1:], nil
        }

        // Hard case: c is backslash.
        if len(s) <= 1 {
                err = ErrSyntax
                return
        }
        c := s[1]
        s = s[2:]

        switch c {
        case 'a':
                value = '\a'
        case 'b':
                value = '\b'
        case 'f':
                value = '\f'
        case 'n':
                value = '\n'
        case 'r':
                value = '\r'
        case 't':
                value = '\t'
        case 'v':
                value = '\v'
        case 'x', 'u', 'U':
                // Hexadecimal escape: the letter selects how many hex digits follow.
                n := 0
                switch c {
                case 'x':
                        n = 2
                case 'u':
                        n = 4
                case 'U':
                        n = 8
                }
                var v rune
                if len(s) < n {
                        err = ErrSyntax
                        return
                }
                for j := 0; j < n; j++ {
                        x, ok := unhex(s[j])
                        if !ok {
                                err = ErrSyntax
                                return
                        }
                        v = v<<4 | x
                }
                s = s[n:]
                if c == 'x' {
                        // Single-byte string, possibly not UTF-8.
                        value = v
                        break
                }
                if v > utf8.MaxRune {
                        err = ErrSyntax
                        return
                }
                value = v
                multibyte = true
        case '0', '1', '2', '3', '4', '5', '6', '7':
                // Octal escape: exactly three digits, value at most 255.
                v := rune(c) - '0'
                if len(s) < 2 {
                        err = ErrSyntax
                        return
                }
                for j := 0; j < 2; j++ { // One digit already; two more.
                        x := rune(s[j]) - '0'
                        if x < 0 || x > 7 {
                                err = ErrSyntax
                                return
                        }
                        v = (v << 3) | x
                }
                s = s[2:]
                if v > 255 {
                        err = ErrSyntax
                        return
                }
                value = v
        case '\\':
                value = '\\'
        case '\'', '"':
                // An escaped quote is only valid if it matches the enclosing quote.
                if c != quote {
                        err = ErrSyntax
                        return
                }
                value = rune(c)
        default:
                err = ErrSyntax
                return
        }
        tail = s
        return
}

// contains reports whether the string contains the byte c.
// The comparison is byte-wise, not rune-wise.
func contains(s string, c byte) bool {
        for _, b := range []byte(s) {
                if b == c {
                        return true
                }
        }
        return false
}

// unhex converts a single hexadecimal digit (0-9, a-f, A-F) to its numeric
// value. ok is false, and v is 0, when b is not a hexadecimal digit.
func unhex(b byte) (v rune, ok bool) {
        if b >= '0' && b <= '9' {
                return rune(b - '0'), true
        }
        if b >= 'a' && b <= 'f' {
                return rune(b-'a') + 10, true
        }
        if b >= 'A' && b <= 'F' {
                return rune(b-'A') + 10, true
        }
        return 0, false
}

// Copyright 2013 The Prometheus Authors
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package strutil

import (
        "fmt"
        "net/url"
        "strings"

        "github.com/grafana/regexp"
)

// invalidLabelCharRE matches any single character that is not an ASCII
// letter, an ASCII digit or an underscore.
var invalidLabelCharRE = regexp.MustCompile(`[^a-zA-Z0-9_]`)

// TableLinkForExpression creates an escaped relative link to the table view of
// the provided expression.
// The expression is query-escaped and placed in g0.expr; g0.tab is set to 1.
func TableLinkForExpression(expr string) string {
        return fmt.Sprintf("/graph?g0.expr=%s&g0.tab=1", url.QueryEscape(expr))
}

// GraphLinkForExpression creates an escaped relative link to the graph view of
// the provided expression.
// The expression is query-escaped and placed in g0.expr; g0.tab is set to 0.
func GraphLinkForExpression(expr string) string {
        return fmt.Sprintf("/graph?g0.expr=%s&g0.tab=0", url.QueryEscape(expr))
}

// SanitizeLabelName replaces anything that doesn't match
// client_label.LabelNameRE with an underscore; concretely, every character
// outside [a-zA-Z0-9_] becomes "_".
// Note: this does not handle all Prometheus label name restrictions (such as
// not starting with a digit 0-9), and hence should only be used if the label
// name is prefixed with a known valid string.
func SanitizeLabelName(name string) string {
        const replacement = "_"
        return invalidLabelCharRE.ReplaceAllString(name, replacement)
}

// SanitizeFullLabelName replaces any invalid character with an underscore, and
// if given an empty string, returns a string containing a single underscore.
// ASCII letters and '_' are always kept; ASCII digits are kept everywhere
// except in the first position. Every other rune becomes a single '_'.
func SanitizeFullLabelName(name string) string {
        if name == "" {
                return "_"
        }
        var out strings.Builder
        for pos, r := range name {
                switch {
                case r >= 'a' && r <= 'z', r >= 'A' && r <= 'Z', r == '_':
                        out.WriteRune(r)
                case pos > 0 && r >= '0' && r <= '9':
                        // Digits are only valid after the first character.
                        out.WriteRune(r)
                default:
                        out.WriteRune('_')
                }
        }
        return out.String()
}

// Copyright 2017 The Prometheus Authors
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package teststorage

import (
        "fmt"
        "os"
        "time"

        "github.com/prometheus/client_golang/prometheus"
        "github.com/stretchr/testify/require"

        "github.com/prometheus/prometheus/model/exemplar"
        "github.com/prometheus/prometheus/model/labels"
        "github.com/prometheus/prometheus/storage"
        "github.com/prometheus/prometheus/tsdb"
        "github.com/prometheus/prometheus/util/testutil"
)

// New returns a new TestStorage for testing purposes
// that removes all associated files on closing.
// Any error from NewWithError fails the test via require.NoError.
func New(t testutil.T) *TestStorage {
        ts, err := NewWithError()
        require.NoError(t, err)
        return ts
}

// NewWithError returns a new TestStorage for user facing tests, which reports
// errors directly.
//
// It creates a temporary directory, opens a TSDB in it with 24h blocks,
// retention duration 0 and native histograms enabled, and attaches a circular
// exemplar storage of size 10. If any step fails, whatever was created so far
// (the database and the temporary directory) is cleaned up on a best-effort
// basis before the error is returned.
func NewWithError() (*TestStorage, error) {
        dir, err := os.MkdirTemp("", "test_storage")
        if err != nil {
                return nil, fmt.Errorf("opening test directory: %w", err)
        }

        // Tests just load data for a series sequentially. Thus we
        // need a long appendable window.
        opts := tsdb.DefaultOptions()
        opts.MinBlockDuration = int64(24 * time.Hour / time.Millisecond)
        opts.MaxBlockDuration = int64(24 * time.Hour / time.Millisecond)
        opts.RetentionDuration = 0
        opts.EnableNativeHistograms = true
        db, err := tsdb.Open(dir, nil, nil, opts, tsdb.NewDBStats())
        if err != nil {
                // Best-effort cleanup; the open error is the one worth reporting.
                _ = os.RemoveAll(dir)
                return nil, fmt.Errorf("opening test storage: %w", err)
        }
        reg := prometheus.NewRegistry()
        eMetrics := tsdb.NewExemplarMetrics(reg)

        es, err := tsdb.NewCircularExemplarStorage(10, eMetrics)
        if err != nil {
                // Best-effort cleanup: release the database before removing its files.
                _ = db.Close()
                _ = os.RemoveAll(dir)
                return nil, fmt.Errorf("opening test exemplar storage: %w", err)
        }
        return &TestStorage{DB: db, exemplarStorage: es, dir: dir}, nil
}

// TestStorage is a TSDB-backed storage for tests. It embeds the database,
// adds an exemplar storage, and remembers the directory holding its files so
// that Close can remove it.
type TestStorage struct {
        *tsdb.DB
        // exemplarStorage backs ExemplarQueryable and AppendExemplar.
        exemplarStorage tsdb.ExemplarStorage
        // dir is the temporary directory created by NewWithError.
        dir string
}

// Close closes the underlying database and removes the storage directory.
//
// The directory is removed even when closing the database fails, so that a
// failed close does not leave temporary files behind. The database error
// takes precedence; the removal error is returned only if the close succeeded.
func (s TestStorage) Close() error {
        closeErr := s.DB.Close()
        removeErr := os.RemoveAll(s.dir)
        if closeErr != nil {
                return closeErr
        }
        return removeErr
}

// ExemplarAppender returns the storage itself, which appends exemplars
// through AppendExemplar.
func (s TestStorage) ExemplarAppender() storage.ExemplarAppender {
        var appender storage.ExemplarAppender = s
        return appender
}

// ExemplarQueryable returns the exemplar storage for querying.
func (s TestStorage) ExemplarQueryable() storage.ExemplarQueryable {
        var queryable storage.ExemplarQueryable = s.exemplarStorage
        return queryable
}

// AppendExemplar adds e for the series labels l to the exemplar storage.
// The given ref is returned unchanged together with any error from the
// exemplar storage.
func (s TestStorage) AppendExemplar(ref storage.SeriesRef, l labels.Labels, e exemplar.Exemplar) (storage.SeriesRef, error) {
        err := s.exemplarStorage.AddExemplar(l, e)
        return ref, err
}

// Copyright 2023 The Prometheus Authors
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package testutil

import (
        "fmt"
        "testing"

        "github.com/google/go-cmp/cmp"
        "github.com/stretchr/testify/require"

        "github.com/prometheus/prometheus/model/labels"
)

// Replacement for require.Equal using go-cmp adapted for Prometheus data structures, instead of DeepEqual.
func RequireEqual(t testing.TB, expected, actual interface{}, msgAndArgs ...interface{}) {
        t.Helper()
        RequireEqualWithOptions(t, expected, actual, nil, msgAndArgs...)
}

// RequireEqualWithOptions behaves like RequireEqual but allows extra
// cmp.Options to be passed to the comparison.
//
// A comparer based on labels.Equal is always installed first; the options in
// extra are appended after it. When the values differ, the test is failed via
// require.Fail with a message that shows both values and the go-cmp diff.
func RequireEqualWithOptions(t testing.TB, expected, actual interface{}, extra []cmp.Option, msgAndArgs ...interface{}) {
        t.Helper()
        options := append([]cmp.Option{cmp.Comparer(labels.Equal)}, extra...)
        if cmp.Equal(expected, actual, options...) {
                return
        }
        diff := cmp.Diff(expected, actual, options...)
        // %v is used because expected/actual are arbitrary values: %s would
        // render non-string values as "%!s(...)" noise. The diff gets its own
        // lines so it does not run into the printed "actual" value.
        require.Fail(t, fmt.Sprintf("Not equal: \n"+
                "expected: %v\n"+
                "actual  : %v\n"+
                "diff (-expected +actual):\n%s", expected, actual, diff), msgAndArgs...)
}

// Copyright 2016 The Prometheus Authors
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package testutil

import (
        "context"
        "time"

        "go.uber.org/atomic"
)

// MockContext is a simple stub implementation of a Context whose Done channel
// and Err result are taken directly from its exported fields.
type MockContext struct {
        // Error is the value returned by Err.
        Error error
        // DoneCh is the channel returned by Done.
        DoneCh chan struct{}
}

// Deadline always reports that no deadline is set: it returns the zero
// time.Time and false.
func (m *MockContext) Deadline() (deadline time.Time, ok bool) {
        // The named results are never assigned, so they still hold their zero
        // values here.
        return deadline, ok
}

// Done returns DoneCh as a receive-only channel.
func (m *MockContext) Done() <-chan struct{} {
        var done <-chan struct{} = m.DoneCh
        return done
}

// Err returns the Error field, which is nil if it was never set.
func (m *MockContext) Err() error {
        return m.Error
}

// Value ignores its key and always returns nil.
func (m *MockContext) Value(interface{}) interface{} {
        var none interface{}
        return none
}

// MockContextErrAfter is a MockContext that will return an error after a certain
// number of calls to Err().
type MockContextErrAfter struct {
        MockContext
        // count is the number of Err calls made so far.
        count atomic.Uint64
        // FailAfter is the call count from which Err starts returning
        // context.Canceled.
        FailAfter uint64
}

// Err counts the call and returns context.Canceled once the number of calls
// has reached FailAfter. Before that it returns the embedded MockContext's
// Err result.
func (c *MockContextErrAfter) Err() error {
        // Compare against the value returned by Inc rather than a separate Load,
        // so that each caller decides based on its own call number even when Err
        // is invoked concurrently.
        if c.count.Inc() >= c.FailAfter {
                return context.Canceled
        }
        return c.MockContext.Err()
}

// Count returns how many times Err has been called so far.
func (c *MockContextErrAfter) Count() uint64 {
        return c.count.Load()
}

// Copyright 2013 The Prometheus Authors
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package testutil

import (
        "crypto/sha256"
        "io"
        "os"
        "path/filepath"
        "strconv"
        "testing"

        "github.com/stretchr/testify/require"
)

const (
        // defaultDirectory is the base directory used for test emissions. It is
        // passed as the dir argument of os.MkdirTemp by NewTemporaryDirectory;
        // the empty string instructs the operating system to use the default
        // temporary directory (or the TMPDIR environment variable) as the base.
        defaultDirectory = ""

        // NilCloser is a no-op Closer.
        NilCloser = nilCloser(true)

        // temporaryDirectoryRemoveRetries is the number of times that a
        // TemporaryDirectory will retry its removal after the first attempt
        // has failed.
        temporaryDirectoryRemoveRetries = 2
)

type (
        // Closer is the interface that wraps the Close method.
        Closer interface {
                // Close reaps the underlying directory and its children. The directory
                // could be deleted by its users already.
                Close()
        }

        // nilCloser is a Closer whose Close method does nothing. Its only value
        // in this file is the exported NilCloser constant.
        nilCloser bool

        // TemporaryDirectory models a closeable path for transient POSIX disk
        // activities.
        TemporaryDirectory interface {
                Closer

                // Path returns the underlying path for access.
                Path() string
        }

        // temporaryDirectory is kept as a private type due to private fields and
        // their interactions.
        temporaryDirectory struct {
                // path is the directory returned by Path and removed by Close.
                path string
                // tester receives the failure if removal in Close does not succeed.
                tester T
        }

        // callbackCloser is a Closer that invokes fn when it is closed.
        callbackCloser struct {
                fn func()
        }

        // T implements the needed methods of testing.TB so that we do not need
        // to actually import testing (which has the side effect of adding all
        // the test flags, which we do not want in non-test binaries even if
        // they make use of these utilities for some reason).
        T interface {
                Errorf(format string, args ...interface{})
                FailNow()
        }
)

// Close implements Closer and intentionally does nothing.
func (nilCloser) Close() {}

// Close implements Closer by invoking the stored callback.
func (c callbackCloser) Close() {
        callback := c.fn
        callback()
}

// NewCallbackCloser wraps fn in a Closer; fn is called each time the returned
// Closer is closed.
func NewCallbackCloser(fn func()) Closer {
        closer := new(callbackCloser)
        closer.fn = fn
        return closer
}

// Close removes the directory tree at t.path. A failed removal is retried up
// to temporaryDirectoryRemoveRetries more times; a "does not exist" error is
// treated as success. Any error that remains is reported through t.tester.
func (t temporaryDirectory) Close() {
        err := os.RemoveAll(t.path)
        for attemptsLeft := temporaryDirectoryRemoveRetries; err != nil && attemptsLeft > 0; {
                if os.IsNotExist(err) {
                        // Somebody else already removed it; nothing left to do.
                        err = nil
                        break
                }
                attemptsLeft--
                err = os.RemoveAll(t.path)
        }
        require.NoError(t.tester, err)
}

// Path returns the path of the temporary directory.
func (t temporaryDirectory) Path() string {
        dirPath := t.path
        return dirPath
}

// NewTemporaryDirectory creates a new temporary directory for transient POSIX
// activities. The directory is created with os.MkdirTemp under
// defaultDirectory using name as the pattern; a creation error is reported
// through t.
func NewTemporaryDirectory(name string, t T) (handler TemporaryDirectory) {
        created, mkErr := os.MkdirTemp(defaultDirectory, name)
        require.NoError(t, mkErr)

        return temporaryDirectory{
                path:   created,
                tester: t,
        }
}

// DirHash returns a hash of all files attributes and their content within a directory.
//
// The directory tree rooted at path is walked with filepath.Walk. Directories
// themselves contribute nothing; for every other entry the SHA-256 state is
// fed, in this order, with the file content, its size in decimal, its base
// name and its gob-encoded modification time. Any error fails the test t.
func DirHash(t *testing.T, path string) []byte {
        hash := sha256.New()
        err := filepath.Walk(path, func(path string, info os.FileInfo, err error) error {
                require.NoError(t, err)

                if info.IsDir() {
                        return nil
                }
                f, err := os.Open(path)
                require.NoError(t, err)
                defer f.Close()

                _, err = io.Copy(hash, f)
                require.NoError(t, err)

                // Format the int64 size directly: converting it to int first would
                // truncate sizes above 2 GiB on 32-bit platforms.
                _, err = io.WriteString(hash, strconv.FormatInt(info.Size(), 10))
                require.NoError(t, err)

                _, err = io.WriteString(hash, info.Name())
                require.NoError(t, err)

                modTime, err := info.ModTime().GobEncode()
                require.NoError(t, err)

                _, err = io.WriteString(hash, string(modTime))
                require.NoError(t, err)
                return nil
        })
        require.NoError(t, err)

        return hash.Sum(nil)
}

// Copyright 2019 The Prometheus Authors
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package testutil

import (
        "testing"

        "github.com/go-kit/log"
)

// logger adapts a *testing.T to the go-kit log.Logger interface.
type logger struct {
        t *testing.T
}

// NewLogger returns a gokit compatible Logger which calls t.Log.
func NewLogger(t *testing.T) log.Logger {
        adapter := logger{t: t}
        return adapter
}

// Log implements log.Logger by forwarding all key/value arguments to t.Log.
// It always returns nil.
func (l logger) Log(keyvals ...interface{}) error {
        l.t.Log(keyvals...)
        return nil
}

// Copyright 2021 The Prometheus Authors
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package testutil

import (
        "net"
        "testing"
)

// RandomUnprivilegedPort returns a valid unprivileged random port number which
// can be used for testing. It listens on TCP ":0" so that a port gets assigned,
// notes that port and closes the listener again. The test is aborted if either
// listening or closing fails.
func RandomUnprivilegedPort(t *testing.T) int {
        t.Helper()

        ln, listenErr := net.Listen("tcp", ":0")
        if listenErr != nil {
                t.Fatalf("Listening on random port: %v", listenErr)
        }

        tcpAddr := ln.Addr().(*net.TCPAddr)
        port := tcpAddr.Port

        closeErr := ln.Close()
        if closeErr != nil {
                t.Fatalf("Closing listener: %v", closeErr)
        }

        return port
}

// Copyright 2017 The Prometheus Authors
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package testutil

import (
        "net/http"
)

// roundTrip is an http.RoundTripper that answers every request with the same
// canned response and error.
type roundTrip struct {
        theResponse *http.Response
        theError    error
}

// RoundTrip ignores the request and returns the stored response and error.
func (rt *roundTrip) RoundTrip(*http.Request) (*http.Response, error) {
        resp, err := rt.theResponse, rt.theError
        return resp, err
}

// roundTripCheckRequest is a roundTrip that first passes every request to
// checkRequest.
type roundTripCheckRequest struct {
        checkRequest func(*http.Request)
        roundTrip
}

// RoundTrip calls checkRequest with r and then returns the canned response
// and error of the embedded roundTrip.
func (rt *roundTripCheckRequest) RoundTrip(r *http.Request) (*http.Response, error) {
        rt.checkRequest(r)
        return rt.roundTrip.RoundTrip(r)
}

// NewRoundTripCheckRequest creates a new instance of a type that implements http.RoundTripper,
// which before returning theResponse and theError, executes checkRequest against a http.Request.
func NewRoundTripCheckRequest(checkRequest func(*http.Request), theResponse *http.Response, theError error) http.RoundTripper {
        rt := &roundTripCheckRequest{checkRequest: checkRequest}
        rt.theResponse = theResponse
        rt.theError = theError
        return rt
}

// The MIT License (MIT)

// Copyright (c) 2014 Ben Johnson

// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:

// The above copyright notice and this permission notice shall be included in all
// copies or substantial portions of the Software.

// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
// SOFTWARE.

package testutil

import (
        "testing"

        "go.uber.org/goleak"
)

// TolerantVerifyLeak runs goleak.VerifyTestMain for m while ignoring the
// goroutines that are launched as side effects of some of our dependencies.
func TolerantVerifyLeak(m *testing.M) {
        ignored := []goleak.Option{
                // https://github.com/census-instrumentation/opencensus-go/blob/d7677d6af5953e0506ac4c08f349c62b917a443a/stats/view/worker.go#L34
                goleak.IgnoreTopFunction("go.opencensus.io/stats/view.(*worker).start"),
                // https://github.com/kubernetes/klog/blob/c85d02d1c76a9ebafa81eb6d35c980734f2c4727/klog.go#L417
                goleak.IgnoreTopFunction("k8s.io/klog/v2.(*loggingT).flushDaemon"),
                // This go routine uses a ticker to stop, so it can create false
                // positives.
                // https://github.com/kubernetes/client-go/blob/f6ce18ae578c8cca64d14ab9687824d9e1305a67/util/workqueue/queue.go#L201
                goleak.IgnoreTopFunction("k8s.io/client-go/util/workqueue.(*Type).updateUnfinishedWorkLoop"),
        }
        goleak.VerifyTestMain(m, ignored...)
}

// Copyright 2023 The Prometheus Authors
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
// Package zeropool provides a zero-allocation type-safe alternative for sync.Pool, used to work around staticcheck SA6002.
// The contents of this package are brought from https://github.com/colega/zeropool because "little copying is better than little dependency".

package zeropool

import "sync"

// Pool is a type-safe pool of items of type T that avoids allocating a pointer
// for every pooled item. It still allocates occasionally, much like sync.Pool
// only pools items most of the time, but the pointers it allocates are kept in
// a secondary pool instead of being dropped, so they can carry items again
// later.
//
// Zero value of Pool[T] is valid, and it will return zero values of T if nothing is pooled.
type Pool[T any] struct {
        // items holds pointers whose pointees are pooled items ready to be
        // handed out.
        items sync.Pool
        // pointers holds spare pointers only. Whatever they point at belongs to
        // some other caller and must not be used, but it may be overwritten.
        pointers sync.Pool
}

// New creates a new Pool[T] that calls item whenever a fresh value is needed.
// A Pool must not be copied after first use.
func New[T any](item func() T) Pool[T] {
        makeItem := func() interface{} {
                fresh := item()
                return &fresh
        }
        return Pool[T]{
                items: sync.Pool{New: makeItem},
        }
}

// Get returns an item from the pool, creating a new one if necessary.
// Get may be called concurrently from multiple goroutines.
func (p *Pool[T]) Get() T {
        if pooled := p.items.Get(); pooled != nil {
                ptr := pooled.(*T)
                // Copy the item out; ptr keeps pointing at a copy nobody will use,
                // so the pointer itself can be recycled.
                value := *ptr
                p.pointers.Put(ptr)
                return value
        }

        // Only reachable for a zero-value Pool whose items pool is empty. There
        // is no pointer to recycle here, so just hand out the zero value.
        var zero T
        return zero
}

// Put adds an item to the pool, reusing a spare pointer when one is available
// and allocating a new one otherwise.
func (p *Pool[T]) Put(item T) {
        spare := p.pointers.Get()
        if spare == nil {
                spare = new(T)
        }
        ptr := spare.(*T)
        *ptr = item
        p.items.Put(ptr)
}